From 6c55401037f9ff01170e3979dca51f6b2a9a8293 Mon Sep 17 00:00:00 2001 From: simonmar Date: Fri, 1 Apr 2005 12:14:30 +0000 Subject: [PATCH] [project @ 2005-04-01 12:14:29 by simonmar] First cut at the x86_64 native code generator. Lots of code is shared with i386, but floating point uses SSE2. This more or less works, the things I know that don't work are: - the floating-point primitives (sin, cos etc.) are missing - floating-point comparisons involving NaN are wrong - there's no PIC support yet Also, I have a long list of small things to fix up to improve performance. I think the small memory model is assumed, for now. --- ghc/compiler/cmm/Cmm.hs | 1 + ghc/compiler/nativeGen/AsmCodeGen.lhs | 37 +- ghc/compiler/nativeGen/MachCodeGen.hs | 652 ++++++++++++++++++++++++++------ ghc/compiler/nativeGen/MachInstrs.hs | 106 +++--- ghc/compiler/nativeGen/MachRegs.lhs | 160 +++++++- ghc/compiler/nativeGen/NCG.h | 6 + ghc/compiler/nativeGen/PprMach.hs | 284 +++++++++++--- ghc/compiler/nativeGen/RegAllocInfo.hs | 62 ++- 8 files changed, 1050 insertions(+), 258 deletions(-) diff --git a/ghc/compiler/cmm/Cmm.hs b/ghc/compiler/cmm/Cmm.hs index aa92e01..a8576ec 100644 --- a/ghc/compiler/cmm/Cmm.hs +++ b/ghc/compiler/cmm/Cmm.hs @@ -241,6 +241,7 @@ data Section | ReadOnlyData | RelocatableReadOnlyData | UninitialisedData + | ReadOnlyData16 -- .rodata.cst16 on x86_64, 16-byte aligned | OtherSection String data CmmStatic diff --git a/ghc/compiler/nativeGen/AsmCodeGen.lhs b/ghc/compiler/nativeGen/AsmCodeGen.lhs index e790991..2675a26 100644 --- a/ghc/compiler/nativeGen/AsmCodeGen.lhs +++ b/ghc/compiler/nativeGen/AsmCodeGen.lhs @@ -552,12 +552,37 @@ cmmMachOpFold op arg@[CmmLit (CmmInt x rep)] cmmMachOpFold (MO_S_Conv rep1 rep2) [x] | rep1 == rep2 = x cmmMachOpFold (MO_U_Conv rep1 rep2) [x] | rep1 == rep2 = x --- ToDo: eliminate multiple conversions. Be careful though: can't remove --- a narrowing, and can't remove conversions to/from floating point types. - --- ToDo: eliminate nested comparisons: --- CmmMachOp MO_Lt [CmmMachOp MO_Eq [x,y], CmmLit (CmmInt 0 _)] --- turns into a simple equality test. +-- Eliminate nested conversions where possible +cmmMachOpFold conv_outer args@[CmmMachOp conv_inner [x]] + | Just (rep1,rep2,signed1) <- isIntConversion conv_inner, + Just (_, rep3,signed2) <- isIntConversion conv_outer + = case () of + -- widen then narrow to the same size is a nop + _ | rep1 < rep2 && rep1 == rep3 -> x + -- Widen then narrow to different size: collapse to single conversion + -- but remember to use the signedness from the widening, just in case + -- the final conversion is a widen. + | rep1 < rep2 && rep2 > rep3 -> + cmmMachOpFold (intconv signed1 rep1 rep3) [x] + -- Nested widenings: collapse if the signedness is the same + | rep1 < rep2 && rep2 < rep3 && signed1 == signed2 -> + cmmMachOpFold (intconv signed1 rep1 rep3) [x] + -- Nested narrowings: collapse + | rep1 > rep2 && rep2 > rep3 -> + cmmMachOpFold (MO_U_Conv rep1 rep3) [x] + | otherwise -> + CmmMachOp conv_outer args + where + isIntConversion (MO_U_Conv rep1 rep2) = Just (rep1,rep2,False) + isIntConversion (MO_S_Conv rep1 rep2) = Just (rep1,rep2,True) + isIntConversion _ = Nothing + + intconv True = MO_S_Conv + intconv False = MO_U_Conv + +-- ToDo: a narrow of a load can be collapsed into a narrow load, right? +-- but what if the architecture only supports word-sized loads, should +-- we do the transformation anyway? 
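[Editorial aside] The nested-conversion rules above are easiest to see on a toy model. The following standalone Haskell sketch restates the four guards; Width, Conv and foldConv are illustrative names, not GHC's. Note how the collapsed conversion keeps the signedness of the inner widening:

data Width = W8 | W16 | W32 | W64 deriving (Eq, Ord, Show)
data Conv  = Conv { signed :: Bool, from :: Width, to :: Width } deriving Show

-- Fold `outer` applied to the result of `inner`:
--   Nothing       -> cannot fold
--   Just Nothing  -> the pair is a no-op
--   Just (Just c) -> the pair collapses to the single conversion c
foldConv :: Conv -> Conv -> Maybe (Maybe Conv)
foldConv outer inner
  | rep1 < rep2 && rep1 == rep3 = Just Nothing                       -- widen, then narrow back
  | rep1 < rep2 && rep2 > rep3  = Just (Just (Conv s1 rep1 rep3))    -- widen, then narrow
  | rep1 < rep2 && rep2 < rep3 && s1 == s2
                                = Just (Just (Conv s1 rep1 rep3))    -- nested widenings
  | rep1 > rep2 && rep2 > rep3  = Just (Just (Conv False rep1 rep3)) -- nested narrowings
  | otherwise                   = Nothing
  where
    (rep1, rep2, s1) = (from inner, to inner, signed inner)
    (rep3, s2)       = (to outer, signed outer)

For instance, an unsigned widening W8 -> W32 followed by a narrowing W32 -> W16 collapses to the single zero-extension Conv False W8 W16 — exactly the "remember to use the signedness from the widening" case in the comment above.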
cmmMachOpFold mop args@[CmmLit (CmmInt x xrep), CmmLit (CmmInt y _)] = case mop of diff --git a/ghc/compiler/nativeGen/MachCodeGen.hs b/ghc/compiler/nativeGen/MachCodeGen.hs index 35e0105..c93b678 100644 --- a/ghc/compiler/nativeGen/MachCodeGen.hs +++ b/ghc/compiler/nativeGen/MachCodeGen.hs @@ -37,6 +37,7 @@ import Outputable import qualified Outputable import FastString import FastTypes ( isFastTrue ) +import Constants ( wORD_SIZE ) #ifdef DEBUG import Outputable ( assertPanic ) @@ -102,15 +103,17 @@ stmtToInstrs stmt = case stmt of CmmAssign reg src | isFloatingRep kind -> assignReg_FltCode kind reg src - | wordRep == I32 && kind == I64 - -> assignReg_I64Code reg src +#if WORD_SIZE_IN_BITS==32 + | kind == I64 -> assignReg_I64Code reg src +#endif | otherwise -> assignReg_IntCode kind reg src where kind = cmmRegRep reg CmmStore addr src | isFloatingRep kind -> assignMem_FltCode kind addr src - | wordRep == I32 && kind == I64 - -> assignMem_I64Code addr src +#if WORD_SIZE_IN_BITS==32 + | kind == I64 -> assignMem_I64Code addr src +#endif | otherwise -> assignMem_IntCode kind addr src where kind = cmmExprRep src @@ -157,9 +160,14 @@ data ChildCode64 -- a.k.a "Register64" -- selection game are therefore that the returned -- Reg may be modified +#if WORD_SIZE_IN_BITS==32 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock +#endif + +#ifndef x86_64_TARGET_ARCH iselExpr64 :: CmmExpr -> NatM ChildCode64 +#endif -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -463,6 +471,21 @@ swizzleRegisterRep (Any _ codefn) rep = Any rep codefn -- ----------------------------------------------------------------------------- +-- Utils based on getRegister, below + +-- The dual to getAnyReg: compute an expression into a register, but +-- we don't mind which one it is. +getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed _ reg code -> + return (reg, code) + +-- ----------------------------------------------------------------------------- -- Grab the Reg for a CmmReg getRegisterReg :: CmmReg -> Reg @@ -769,6 +792,31 @@ getRegister (CmmLit (CmmFloat d F64)) -- in return (Any F64 code) +#endif /* i386_TARGET_ARCH */ + +#if x86_64_TARGET_ARCH + +getRegister (CmmLit (CmmFloat 0.0 rep)) = do + let code dst = unitOL (XOR rep (OpReg dst) (OpReg dst)) + -- I don't know why there are xorpd, xorps, and pxor instructions. 
+ -- They all appear to do the same thing --SDM + return (Any rep code) + +getRegister (CmmLit (CmmFloat f rep)) = do + lbl <- getNewLabelNat + let code dst = toOL [ + LDATA ReadOnlyData + [CmmDataLabel lbl, + CmmStaticLit (CmmFloat f rep)], + MOV rep (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- ToDo: should use %rip-relative + ] + -- in + return (Any rep code) + +#endif /* x86_64_TARGET_ARCH */ + +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- catch simple cases of zero- or sign-extended load getRegister (CmmMachOp (MO_U_Conv I8 I32) [CmmLoad addr _]) = do @@ -787,11 +835,86 @@ getRegister (CmmMachOp (MO_S_Conv I16 I32) [CmmLoad addr _]) = do code <- intLoadCode (MOVSxL I16) addr return (Any I32 code) +#endif + +#if x86_64_TARGET_ARCH + +-- catch simple cases of zero- or sign-extended load +getRegister (CmmMachOp (MO_U_Conv I8 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL I8) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I8 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I8) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_U_Conv I16 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL I16) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I16 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I16) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_U_Conv I32 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOV I32) addr -- 32-bit loads zero-extend + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I32 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I32) addr + return (Any I64 code) + +#endif + +#if x86_64_TARGET_ARCH +getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do + lbl <- getNewLabelNat + let + code dst = toOL [ + -- This is how gcc does it, so it can't be that bad: + LDATA ReadOnlyData16 [ + CmmAlign 16, + CmmDataLabel lbl, + CmmStaticLit (CmmInt 0x80000000 I32), + CmmStaticLit (CmmInt 0 I32), + CmmStaticLit (CmmInt 0 I32), + CmmStaticLit (CmmInt 0 I32) + ], + XOR F32 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- xorps, so we need the 128-bit constant + -- ToDo: rip-relative + ] + -- + return (Any F32 code) + +getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do + lbl <- getNewLabelNat + let + -- This is how gcc does it, so it can't be that bad: + code dst = toOL [ + LDATA ReadOnlyData16 [ + CmmAlign 16, + CmmDataLabel lbl, + CmmStaticLit (CmmInt 0x8000000000000000 I64), + CmmStaticLit (CmmInt 0 I64) + ], + -- gcc puts an unpck here. Wonder if we need it. 
+ XOR F64 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- xorpd, so we need the 128-bit constant + -- ToDo: rip-relative + ] + -- + return (Any F64 code) +#endif + +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH getRegister (CmmMachOp mop [x]) -- unary MachOps = case mop of +#if i386_TARGET_ARCH MO_S_Neg F32 -> trivialUFCode F32 (GNEG F32) x MO_S_Neg F64 -> trivialUFCode F64 (GNEG F64) x +#endif MO_S_Neg rep -> trivialUCode rep (NEGI rep) x MO_Not rep -> trivialUCode rep (NOT rep) x @@ -805,6 +928,15 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps MO_S_Conv I16 I8 -> conversionNop I16 x MO_U_Conv I32 I16 -> conversionNop I32 x MO_S_Conv I32 I16 -> conversionNop I32 x +#if x86_64_TARGET_ARCH + MO_U_Conv I64 I32 -> conversionNop I64 x + MO_S_Conv I64 I32 -> conversionNop I64 x + MO_U_Conv I64 I16 -> conversionNop I64 x + MO_S_Conv I64 I16 -> conversionNop I64 x + MO_U_Conv I64 I8 -> conversionNop I64 x + MO_S_Conv I64 I8 -> conversionNop I64 x +#endif + MO_U_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x MO_S_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x @@ -817,12 +949,32 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps MO_S_Conv I16 I32 -> integerExtend I16 I32 MOVSxL x MO_S_Conv I8 I16 -> integerExtend I8 I16 MOVSxL x +#if x86_64_TARGET_ARCH + MO_U_Conv I8 I64 -> integerExtend I8 I64 MOVZxL x + MO_U_Conv I16 I64 -> integerExtend I16 I64 MOVZxL x + MO_U_Conv I32 I64 -> integerExtend I32 I64 MOVZxL x + MO_S_Conv I8 I64 -> integerExtend I8 I64 MOVSxL x + MO_S_Conv I16 I64 -> integerExtend I16 I64 MOVSxL x + MO_S_Conv I32 I64 -> integerExtend I32 I64 MOVSxL x + -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl. + -- However, we don't want the register allocator to throw it + -- away as an unnecessary reg-to-reg move, so we keep it in + -- the form of a movzl and print it as a movl later. +#endif + +#if i386_TARGET_ARCH MO_S_Conv F32 F64 -> conversionNop F64 x MO_S_Conv F64 F32 -> conversionNop F32 x +#else + MO_S_Conv F32 F64 -> coerceFP2FP F64 x + MO_S_Conv F64 F32 -> coerceFP2FP F32 x +#endif + MO_S_Conv from to | isFloatingRep from -> coerceFP2Int from to x | isFloatingRep to -> coerceInt2FP from to x + other -> pprPanic "getRegister" (pprMachOp mop) where -- signed or unsigned extension. 
integerExtend from to instr expr = do @@ -869,14 +1021,27 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Lt rep -> condIntReg LU x y MO_U_Le rep -> condIntReg LEU x y - MO_Add F32 -> trivialFCode F32 GADD x y - MO_Sub F32 -> trivialFCode F32 GSUB x y +#if i386_TARGET_ARCH + MO_Add F32 -> trivialFCode F32 GADD x y + MO_Sub F32 -> trivialFCode F32 GSUB x y MO_Add F64 -> trivialFCode F64 GADD x y MO_Sub F64 -> trivialFCode F64 GSUB x y - MO_S_Quot F32 -> trivialFCode F32 GDIV x y + MO_S_Quot F32 -> trivialFCode F32 GDIV x y MO_S_Quot F64 -> trivialFCode F64 GDIV x y +#endif + +#if x86_64_TARGET_ARCH + MO_Add F32 -> trivialFCode F32 ADD x y + MO_Sub F32 -> trivialFCode F32 SUB x y + + MO_Add F64 -> trivialFCode F64 ADD x y + MO_Sub F64 -> trivialFCode F64 SUB x y + + MO_S_Quot F32 -> trivialFCode F32 FDIV x y + MO_S_Quot F64 -> trivialFCode F64 FDIV x y +#endif MO_Add rep -> add_code rep x y MO_Sub rep -> sub_code rep x y @@ -886,8 +1051,16 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Quot rep -> div_code rep False True x y MO_U_Rem rep -> div_code rep False False x y +#if i386_TARGET_ARCH MO_Mul F32 -> trivialFCode F32 GMUL x y MO_Mul F64 -> trivialFCode F64 GMUL x y +#endif + +#if x86_64_TARGET_ARCH + MO_Mul F32 -> trivialFCode F32 MUL x y + MO_Mul F64 -> trivialFCode F64 MUL x y +#endif + MO_Mul rep -> let op = IMUL rep in trivialCode rep op (Just op) x y @@ -912,24 +1085,24 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps where -------------------- imulMayOflo :: MachRep -> CmmExpr -> CmmExpr -> NatM Register - imulMayOflo I32 a b = do - res_lo <- getNewRegNat I32 - res_hi <- getNewRegNat I32 + imulMayOflo rep a b = do + res_lo <- getNewRegNat rep + res_hi <- getNewRegNat rep (a_reg, a_code) <- getNonClobberedReg a (b_reg, b_code) <- getSomeReg b let code dst = a_code `appOL` b_code `appOL` toOL [ - MOV I32 (OpReg a_reg) (OpReg res_hi), - MOV I32 (OpReg b_reg) (OpReg res_lo), + MOV rep (OpReg a_reg) (OpReg res_hi), + MOV rep (OpReg b_reg) (OpReg res_lo), IMUL64 res_hi res_lo, -- result in res_hi:res_lo - SAR I32 (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part - SUB I32 (OpReg res_hi) (OpReg res_lo), -- compare against upper - MOV I32 (OpReg res_lo) (OpReg dst) + SAR rep (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part + SUB rep (OpReg res_hi) (OpReg res_lo), -- compare against upper + MOV rep (OpReg res_lo) (OpReg dst) -- dst==0 if high part == sign extended low part ] -- in - return (Any I32 code) + return (Any rep code) -------------------- shift_code :: MachRep @@ -987,7 +1160,7 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps (y_op, y_code) <- getOperand y -- cannot be clobbered x_code <- getAnyReg x let - widen | signed = CLTD + widen | signed = CLTD rep | otherwise = XOR rep (OpReg edx) (OpReg edx) instr | signed = IDIV @@ -1004,17 +1177,18 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps return (Fixed rep result code) - getRegister (CmmLoad mem pk) | isFloatingRep pk = do Amode src mem_code <- getAmode mem let code dst = mem_code `snocOL` - GLD pk src dst + IF_ARCH_i386(GLD pk src dst, + MOV pk (OpAddr src) (OpReg dst)) -- return (Any pk code) +#if i386_TARGET_ARCH getRegister (CmmLoad mem pk) | pk /= I64 = do @@ -1029,6 +1203,15 @@ getRegister (CmmLoad mem pk) -- we can't guarantee access to an 8-bit variant of every register -- (esi and edi don't have 8-bit variants), so to make things -- simpler we do our 8-bit arithmetic with full 32-bit registers. 
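[Editorial aside] On imulMayOflo above: the IMUL64/SAR/SUB sequence implements the rule that a signed N-by-N-bit multiply overflows exactly when the high word of the full 2N-bit product differs from the sign-extension of the low word. A standalone sketch of that test at a fixed 32-bit width (nothing below is GHC code; note the patch keeps the SAR count at 31 even for the generalised rep, where the I64 analogue would be 63):

import Data.Bits (shiftR)
import Data.Int  (Int32, Int64)

mulMayOflo32 :: Int32 -> Int32 -> Bool
mulMayOflo32 a b = hi /= lo `shiftR` 31
  where
    p  = fromIntegral a * fromIntegral b :: Int64        -- the full 64-bit product
    lo = fromIntegral (fromIntegral p :: Int32) :: Int64 -- low word, sign-extended
    hi = p `shiftR` 32                                   -- high word (arithmetic shift)

For example, mulMayOflo32 65536 65536 is True (the product needs 33 bits), while mulMayOflo32 (-1) 1 is False.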
+#endif + +#if x86_64_TARGET_ARCH +-- Simpler memory load code on x86_64 +getRegister (CmmLoad mem pk) + = do + code <- intLoadCode (MOV pk) mem + return (Any pk code) +#endif getRegister (CmmLit (CmmInt 0 rep)) = let @@ -1065,22 +1248,13 @@ anyReg :: Register -> NatM (Reg -> InstrBlock) anyReg (Any _ code) = return code anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst) --- The dual to getAnyReg: compute an expression into a register, but --- we don't mind which one it is. -getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) -getSomeReg expr = do - r <- getRegister expr - case r of - Any rep code -> do - tmp <- getNewRegNat rep - return (tmp, code tmp) - Fixed _ reg code -> - return (reg, code) - -- A bit like getSomeReg, but we want a reg that can be byte-addressed. -- Fixed registers might not be byte-addressable, so we make sure we've -- got a temporary, inserting an extra reg copy if necessary. getByteReg :: CmmExpr -> NatM (Reg, InstrBlock) +#if x86_64_TARGET_ARCH +getByteReg = getSomeReg -- all regs are byte-addressable on x86_64 +#else getByteReg expr = do r <- getRegister expr case r of @@ -1094,6 +1268,7 @@ getByteReg expr = do return (tmp, code `snocOL` reg2reg rep reg tmp) -- ToDo: could optimise slightly by checking for byte-addressable -- real registers, but that will happen very rarely if at all. +#endif -- Another variant: this time we want the result in a register that cannot -- be modified by code to evaluate an arbitrary expression. @@ -1114,10 +1289,12 @@ getNonClobberedReg expr = do reg2reg :: MachRep -> Reg -> Reg -> Instr reg2reg rep src dst +#if i386_TARGET_ARCH | isFloatingRep rep = GMOV src dst +#endif | otherwise = MOV rep (OpReg src) (OpReg dst) -#endif /* i386_TARGET_ARCH */ +#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1553,19 +1730,6 @@ extendSExpr rep x = CmmMachOp (MO_S_Conv rep I32) [x] extendUExpr I32 x = x extendUExpr rep x = CmmMachOp (MO_U_Conv rep I32) [x] --- ###FIXME: exact code duplication from x86 case --- The dual to getAnyReg: compute an expression into a register, but --- we don't mind which one it is. -getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) -getSomeReg expr = do - r <- getRegister expr - case r of - Any rep code -> do - tmp <- getNewRegNat rep - return (tmp, code tmp) - Fixed _ reg code -> - return (reg, code) - #endif /* powerpc_TARGET_ARCH */ @@ -1639,17 +1803,19 @@ getAmode other -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- This is all just ridiculous, since it carefully undoes -- what mangleIndexTree has just done. -getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit (CmmInt i _)]) +getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)]) + | not (is64BitLit lit) -- ASSERT(rep == I32)??? = do (x_reg, x_code) <- getSomeReg x let off = ImmInt (-(fromInteger i)) return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code) -getAmode (CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt i _)]) +getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)]) + | not (is64BitLit lit) -- ASSERT(rep == I32)??? 
= do (x_reg, x_code) <- getSomeReg x let off = ImmInt (fromInteger i) @@ -1674,14 +1840,14 @@ getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _) return (Amode (AddrBaseIndex (Just x_reg) (Just (y_reg,base)) (ImmInt 0)) code) -getAmode (CmmLit lit) +getAmode (CmmLit lit) | not (is64BitLit lit) = return (Amode (ImmAddr (litToImm lit) 0) nilOL) getAmode expr = do (reg,code) <- getSomeReg expr return (Amode (AddrBaseIndex (Just reg) Nothing (ImmInt 0)) code) -#endif /* i386_TARGET_ARCH */ +#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1795,17 +1961,19 @@ getAmode other -- ----------------------------------------------------------------------------- -- getOperand: sometimes any operand will do. --- getOperand gets a *safe* operand; that is, the value of the operand --- will remain valid across the computation of an arbitrary expression, --- unless the expression is computed directly into a register which --- the operand refers to (see trivialCode where this function is used --- for an example). +-- getNonClobberedOperand: the value of the operand will remain valid across +-- the computation of an arbitrary expression, unless the expression +-- is computed directly into a register which the operand refers to +-- (see trivialCode where this function is used for an example). -#ifdef i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -getOperand :: CmmExpr -> NatM (Operand, InstrBlock) -getOperand (CmmLoad mem pk) - | not (isFloatingRep pk) && pk /= I64 = do +getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand (CmmLit lit) + | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = + return (OpImm (litToImm lit), nilOL) +getNonClobberedOperand (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do Amode src mem_code <- getAmode mem (src',save_code) <- if (amodeCouldBeClobbered src) @@ -1816,8 +1984,7 @@ getOperand (CmmLoad mem pk) else return (src, nilOL) return (OpAddr src', save_code `appOL` mem_code) - -getOperand e = do +getNonClobberedOperand e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) @@ -1827,6 +1994,39 @@ amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode) regClobbered (RealReg rr) = isFastTrue (freeReg rr) regClobbered _ = False +-- getOperand: the operand is not required to remain valid across the +-- computation of an arbitrary expression. 
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock) +getOperand (CmmLit lit) + | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = + return (OpImm (litToImm lit), nilOL) +getOperand (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) +getOperand e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +isOperand :: CmmExpr -> Bool +isOperand (CmmLoad _ _) = True +isOperand (CmmLit lit) = not (is64BitLit lit) && + not (isFloatingRep (cmmLitRep lit)) +isOperand _ = False + +getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock) +getRegOrMem (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) +getRegOrMem e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +#if x86_64_TARGET_ARCH +is64BitLit (CmmInt i I64) = i > 0x7fffffff || i < -0x80000000 +#endif +is64BitLit x = False #endif -- ----------------------------------------------------------------------------- @@ -1846,7 +2046,7 @@ getCondCode = panic "MachCode.getCondCode: not on Alphas" -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH || sparc_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH || sparc_TARGET_ARCH -- yes, they really do seem to want exactly the same! getCondCode (CmmMachOp mop [x, y]) @@ -1940,10 +2140,10 @@ condFltCode = panic "MachCode.condFltCode: not on Alphas" #endif /* alpha_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- memory vs immediate -condIntCode cond (CmmLoad x pk) (CmmLit lit) = do +condIntCode cond (CmmLoad x pk) (CmmLit lit) | not (is64BitLit lit) = do Amode x_addr x_code <- getAmode x let imm = litToImm lit @@ -1961,50 +2161,29 @@ condIntCode cond x (CmmLit (CmmInt 0 pk)) = do -- return (CondCode False cond code) --- anything vs immediate -condIntCode cond x (CmmLit lit) = do - (x_reg, x_code) <- getSomeReg x +-- anything vs operand +condIntCode cond x y | isOperand y = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y let - imm = litToImm lit - code = x_code `snocOL` - CMP (cmmLitRep lit) (OpImm imm) (OpReg x_reg) - -- in - return (CondCode False cond code) - --- memory vs anything -condIntCode cond (CmmLoad x pk) y = do - (y_reg, y_code) <- getNonClobberedReg y - Amode x_addr x_code <- getAmode x - let - code = y_code `appOL` - x_code `snocOL` - CMP pk (OpReg y_reg) (OpAddr x_addr) - -- in - return (CondCode False cond code) - --- anything vs memory -condIntCode cond y (CmmLoad x pk) = do - (y_reg, y_code) <- getNonClobberedReg y - Amode x_addr x_code <- getAmode x - let - code = y_code `appOL` - x_code `snocOL` - CMP pk (OpAddr x_addr) (OpReg y_reg) + code = x_code `appOL` y_code `snocOL` + CMP (cmmExprRep x) y_op (OpReg x_reg) -- in return (CondCode False cond code) -- anything vs anything condIntCode cond x y = do - (x_op, x_code) <- getOperand x - (y_reg, y_code) <- getSomeReg y + (y_reg, y_code) <- getNonClobberedReg y + (x_op, x_code) <- getRegOrMem x let - code = x_code `appOL` - y_code `snocOL` + code = y_code `appOL` + x_code `snocOL` CMP (cmmExprRep x) (OpReg y_reg) x_op -- in return (CondCode False cond code) +#endif ------------ +#if i386_TARGET_ARCH condFltCode cond x y = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do (x_reg, x_code) <- getNonClobberedReg x @@ -2015,9 
+2194,25 @@ condFltCode cond x y -- The GCMP insn does the test and sets the zero flag if comparable -- and true. Hence we always supply EQQ as the condition to test. return (CondCode True EQQ code) - #endif /* i386_TARGET_ARCH */ +#if x86_64_TARGET_ARCH +-- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be +-- an operand, but the right must be a reg. We can probably do better +-- than this general case... +condFltCode cond x y = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` + y_code `snocOL` + CMP (cmmExprRep x) y_op (OpReg x_reg) + -- in + return (CondCode False (condToUnsigned cond) code) + -- we need to use the unsigned comparison operators on the + -- result of this comparison. +#endif + -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if sparc_TARGET_ARCH @@ -2171,7 +2366,7 @@ assignIntCode pk dst src -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- integer assignment to memory assignMem_IntCode pk addr src = do @@ -2189,7 +2384,7 @@ assignMem_IntCode pk addr src = do return code where get_op_RI :: CmmExpr -> NatM (InstrBlock,Operand) -- code, operator - get_op_RI (CmmLit lit) + get_op_RI (CmmLit lit) | not (is64BitLit lit) = return (nilOL, OpImm (litToImm lit)) get_op_RI op = do (reg,code) <- getNonClobberedReg op @@ -2298,7 +2493,7 @@ assignFltCode pk dst src -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- Floating point assignment to memory assignMem_FltCode pk addr src = do @@ -2307,7 +2502,8 @@ assignMem_FltCode pk addr src = do let code = src_code `appOL` addr_code `snocOL` - GST pk src_reg addr + IF_ARCH_i386(GST pk src_reg addr, + MOV pk (OpReg src_reg) (OpAddr addr)) return code -- Floating point assignment to a register/temporary @@ -2416,7 +2612,7 @@ genJump tree -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genJump (CmmLoad mem pk) = do Amode target code <- getAmode mem @@ -2471,7 +2667,7 @@ genBranch :: BlockId -> NatM InstrBlock genBranch id = return (unitOL (BR id)) #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genBranch id = return (unitOL (JXX ALWAYS id)) #endif @@ -2661,7 +2857,7 @@ genCondJump lbl (StPrim op [x, y]) -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genCondJump id bool = do CondCode _ cond code <- getCondCode bool @@ -2835,7 +3031,7 @@ genCCall target dest_regs args vols = do toOL ( -- Deallocate parameters after call for ccall; -- but not for stdcall (callee does it) - (if cconv == StdCallConv then [] else + (if cconv == StdCallConv || tot_arg_size==0 then [] else [ADD I32 (OpImm (ImmInt tot_arg_size)) (OpReg esp)]) ++ [DELTA (delta + tot_arg_size)] @@ -2966,6 +3162,159 @@ outOfLineFloatOp mop res args vols -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +#if x86_64_TARGET_ARCH + +genCCall (CmmPrim op) [(r,_)] args vols = + panic "genCCall(CmmPrim)(x86_64)" + +genCCall target dest_regs args vols = do + + -- load up the register arguments + (stack_args, sse_regs, load_args_code) + <- load_args args allArgRegs allFPArgRegs 0 nilOL + + let + tot_arg_size = arg_size * length stack_args + + -- On entry to the called function, %rsp should be aligned + -- on a 16-byte boundary +8 (i.e. 
the first stack arg after + -- the return address is 16-byte aligned). In STG land + -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just + -- need to make sure we push a multiple of 16-bytes of args, + -- plus the return address, to get the correct alignment. + -- Urg, this is hard. We need to feed the delta back into + -- the arg pushing code. + (real_size, adjust_rsp) <- + if tot_arg_size `rem` 16 == 0 + then return (tot_arg_size, nilOL) + else do -- we need to adjust... + delta <- getDeltaNat + setDeltaNat (delta-8) + return (tot_arg_size+8, toOL [ + SUB I64 (OpImm (ImmInt 8)) (OpReg rsp), + DELTA (delta-8) + ]) + + -- push the stack args, right to left + push_code <- push_args (reverse stack_args) nilOL + delta <- getDeltaNat + + -- deal with static vs dynamic call targets + (callinsns,cconv) <- + case target of + -- CmmPrim -> ... + CmmForeignCall (CmmLit (CmmLabel lbl)) conv + -> -- ToDo: stdcall arg sizes + return (unitOL (CALL (Left fn_imm)), conv) + where fn_imm = ImmCLbl lbl + CmmForeignCall expr conv + -> do (dyn_r, dyn_c) <- getSomeReg expr + return (dyn_c `snocOL` CALL (Right dyn_r), conv) + + let + -- The x86_64 ABI requires us to set %al to the number of SSE + -- registers that contain arguments, if the called routine + -- is a varargs function. We don't know whether it's a + -- varargs function or not, so we have to assume it is. + -- + -- It's not safe to omit this assignment, even if the number + -- of SSE regs in use is zero. If %al is larger than 8 + -- on entry to a varargs function, seg faults ensue. + assign_eax n = unitOL (MOV I32 (OpImm (ImmInt n)) (OpReg eax)) + + let call = callinsns `appOL` + toOL ( + -- Deallocate parameters after call for ccall; + -- but not for stdcall (callee does it) + (if cconv == StdCallConv || real_size==0 then [] else + [ADD wordRep (OpImm (ImmInt real_size)) (OpReg esp)]) + ++ + [DELTA (delta + real_size)] + ) + -- in + setDeltaNat (delta + real_size) + + let + -- assign the results, if necessary + assign_code [] = nilOL + assign_code [(dest,_hint)] = + case rep of + F32 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest)) + F64 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest)) + rep -> unitOL (MOV rep (OpReg rax) (OpReg r_dest)) + where + rep = cmmRegRep dest + r_dest = getRegisterReg dest + assign_code many = panic "genCCall.assign_code many" + + return (load_args_code `appOL` + adjust_rsp `appOL` + push_code `appOL` + assign_eax sse_regs `appOL` + call `appOL` + assign_code dest_regs) + + where + arg_size = 8 -- always, at the mo + + load_args :: [(CmmExpr,MachHint)] + -> [Reg] -- int regs avail for args + -> [Reg] -- FP regs avail for args + -> Int -> InstrBlock + -> NatM ([(CmmExpr,MachHint)],Int,InstrBlock) + load_args args [] [] sse_regs code = return (args, sse_regs, code) + -- no more regs to use + load_args [] aregs fregs sse_regs code = return ([],sse_regs,code) + -- no more args to push + load_args ((arg,hint) : rest) aregs fregs sse_regs code + | isFloatingRep arg_rep = + case fregs of + [] -> push_this_arg + (r:rs) -> do + arg_code <- getAnyReg arg + load_args rest aregs rs (sse_regs+1) (code `appOL` arg_code r) + | otherwise = + case aregs of + [] -> push_this_arg + (r:rs) -> do + arg_code <- getAnyReg arg + load_args rest rs fregs sse_regs (code `appOL` arg_code r) + where + arg_rep = cmmExprRep arg + + push_this_arg = do + (args',sse',code') <- load_args rest aregs fregs sse_regs code + return ((arg,hint):args', sse', code') + + push_args [] code = return code + push_args ((arg,hint):rest) code + | isFloatingRep 
arg_rep = do + (arg_reg, arg_code) <- getSomeReg arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` toOL [ + MOV arg_rep (OpReg arg_reg) (OpAddr (spRel 0)), + SUB wordRep (OpImm (ImmInt arg_size)) (OpReg rsp) , + DELTA (delta-arg_size)] + push_args rest code' + + | otherwise = do + -- we only ever generate word-sized function arguments. Promotion + -- has already happened: our Int8# type is kept sign-extended + -- in an Int#, for example. + ASSERT(arg_rep == I64) return () + (arg_op, arg_code) <- getOperand arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` toOL [PUSH I64 arg_op, + DELTA (delta-arg_size)] + push_args rest code' + where + arg_rep = cmmExprRep arg +#endif + +-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #if sparc_TARGET_ARCH {- The SPARC calling convention is an absolute @@ -3365,13 +3714,13 @@ genCCall target dest_regs argsAndHints vols genSwitch :: CmmExpr -> [Maybe BlockId] -> NatM InstrBlock -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genSwitch expr ids = do (reg,e_code) <- getSomeReg expr lbl <- getNewLabelNat let jumpTable = map jumpTableEntry ids - op = OpAddr (AddrBaseIndex Nothing (Just (reg,4)) (ImmCLbl lbl)) + op = OpAddr (AddrBaseIndex Nothing (Just (reg,wORD_SIZE)) (ImmCLbl lbl)) code = e_code `appOL` toOL [ LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable), JMP_TBL op [ id | Just id <- ids ] @@ -3455,7 +3804,7 @@ condFltReg = panic "MachCode.condFltReg (not on Alpha)" -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH condIntReg cond x y = do CondCode _ cond cond_code <- condIntCode cond x y @@ -3658,9 +4007,11 @@ trivialCode -> IF_ARCH_alpha((Reg -> RI -> Reg -> Instr) ,IF_ARCH_i386 ((Operand -> Operand -> Instr) -> Maybe (Operand -> Operand -> Instr) + ,IF_ARCH_x86_64 ((Operand -> Operand -> Instr) + -> Maybe (Operand -> Operand -> Instr) ,IF_ARCH_sparc((Reg -> RI -> Reg -> Instr) ,IF_ARCH_powerpc(Bool -> (Reg -> Reg -> RI -> Instr) - ,)))) + ,))))) -> CmmExpr -> CmmExpr -- the two arguments -> NatM Register @@ -3670,7 +4021,8 @@ trivialFCode -> IF_ARCH_alpha((Reg -> Reg -> Reg -> Instr) ,IF_ARCH_sparc((Size -> Reg -> Reg -> Reg -> Instr) ,IF_ARCH_i386 ((MachRep -> Reg -> Reg -> Reg -> Instr) - ,))) + ,IF_ARCH_x86_64 ((MachRep -> Operand -> Operand -> Instr) + ,)))) -> CmmExpr -> CmmExpr -- the two arguments -> NatM Register #endif @@ -3679,9 +4031,10 @@ trivialUCode :: MachRep -> IF_ARCH_alpha((RI -> Reg -> Instr) ,IF_ARCH_i386 ((Operand -> Instr) + ,IF_ARCH_x86_64 ((Operand -> Instr) ,IF_ARCH_sparc((RI -> Reg -> Instr) ,IF_ARCH_powerpc((Reg -> Reg -> Instr) - ,)))) + ,))))) -> CmmExpr -- the one argument -> NatM Register @@ -3690,8 +4043,9 @@ trivialUFCode :: MachRep -> IF_ARCH_alpha((Reg -> Reg -> Instr) ,IF_ARCH_i386 ((Reg -> Reg -> Instr) + ,IF_ARCH_x86_64 ((Reg -> Reg -> Instr) ,IF_ARCH_sparc((Reg -> Reg -> Instr) - ,))) + ,)))) -> CmmExpr -- the one argument -> NatM Register #endif @@ -3770,7 +4124,7 @@ trivialUFCode _ instr x -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH {- The Rules of the Game are: @@ -3817,16 +4171,8 @@ SDM's version of The Rules: register happens to be the destination register. 
-}

-trivialCode rep instr maybe_revinstr a (CmmLit lit_b) = do
-  a_code <- getAnyReg a
-  let
-       code dst
-         = a_code dst `snocOL`
-           instr (OpImm (litToImm lit_b)) (OpReg dst)
-  -- in
-  return (Any rep code)
-
-trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
+trivialCode rep instr (Just revinstr) (CmmLit lit_a) b
+  | not (is64BitLit lit_a) = do
   b_code <- getAnyReg b
   let
        code dst
@@ -3836,7 +4182,7 @@ trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
   return (Any rep code)

 trivialCode rep instr maybe_revinstr a b = do
-  (b_op, b_code) <- getOperand b
+  (b_op, b_code) <- getNonClobberedOperand b
   a_code <- getAnyReg a
   tmp <- getNewRegNat rep
   let
@@ -3861,6 +4207,7 @@ trivialCode rep instr maybe_revinstr a b = do
   where
     reg `clashesWith` OpReg reg2   = reg == reg2
     reg `clashesWith` OpAddr amode = any (==reg) (addrModeRegs amode)
+    reg `clashesWith` _            = False

 -----------

@@ -3875,6 +4222,8 @@ trivialUCode rep instr x = do

 -----------

+#if i386_TARGET_ARCH
+
 trivialFCode pk instr x y = do
   (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
   (y_reg, y_code) <- getSomeReg y
@@ -3886,6 +4235,26 @@ trivialFCode pk instr x y = do
   -- in
   return (Any pk code)

+#endif
+
+#if x86_64_TARGET_ARCH
+
+-- We use the 2-operand SSE2 floating pt instructions.  ToDo: improve on
+-- this by using some of the special cases in trivialCode above.
+trivialFCode pk instr x y = do
+  (y_reg, y_code) <- getNonClobberedReg y -- these work for float regs too
+  x_code <- getAnyReg x
+  let
+     code dst =
+        y_code `appOL`
+        x_code dst `snocOL`
+        instr pk (IF_ARCH_x86_64(OpReg,) y_reg)
+                 (IF_ARCH_x86_64(OpReg,) dst)
+  -- in
+  return (Any pk code)
+
+#endif
+
 -------------

 trivialUFCode rep instr x = do
@@ -4076,7 +4445,7 @@ remainderCode rep div x y = do

 coerceInt2FP :: MachRep -> MachRep -> CmmExpr -> NatM Register
 coerceFP2Int :: MachRep -> MachRep -> CmmExpr -> NatM Register

-#ifdef sparc_TARGET_ARCH
+#if sparc_TARGET_ARCH
 coerceDbl2Flt :: CmmExpr -> NatM Register
 coerceFlt2Dbl :: CmmExpr -> NatM Register
 #endif
@@ -4144,6 +4513,37 @@ coerceFP2Int from to x = do

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

+#if x86_64_TARGET_ARCH
+
+coerceFP2Int from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceInt2FP from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case to of F32 -> CVTSI2SS; F64 -> CVTSI2SD
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceFP2FP :: MachRep -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+  (x_reg, x_code) <- getSomeReg x
+  let
+        opc  = case to of F32 -> CVTSD2SS; F64 -> CVTSS2SD
+        code dst = x_code `snocOL` opc x_reg dst
+  -- in
+  return (Any to code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 #if sparc_TARGET_ARCH

 coerceInt2FP pk x
diff --git a/ghc/compiler/nativeGen/MachInstrs.hs b/ghc/compiler/nativeGen/MachInstrs.hs
index 1b662e3..0839694 100644
--- a/ghc/compiler/nativeGen/MachInstrs.hs
+++ b/ghc/compiler/nativeGen/MachInstrs.hs
@@ -14,24 +14,22 @@ module MachInstrs (
 	-- * Machine instructions
 	Instr(..),
-	Cond(..),
-#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
+	Cond(..), condUnsigned, condToSigned, condToUnsigned,
+
+#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
 	Size(..), machRepSize,
 #endif
 	RI(..),
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	Operand(..),
+#endif
+#if i386_TARGET_ARCH
i386_insert_ffrees, #endif #if sparc_TARGET_ARCH riZero, fpRelEA, moveSp, fPair, #endif -#if powerpc_TARGET_ARCH - condUnsigned, condToSigned, -#endif - DestInfo(..), hasDestInfo, pprDests, - ) where #include "HsVersions.h" @@ -42,7 +40,6 @@ import MachOp ( MachRep(..) ) import CLabel ( CLabel, pprCLabel ) import Panic ( panic ) import Outputable -import Config ( cLeadingUnderscore ) import FastString import GLAEXTS @@ -72,7 +69,7 @@ data Cond | ULE -- For CMP only | ULT -- For CMP only #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH = ALWAYS -- What's really used? ToDo | EQQ | GE @@ -122,6 +119,23 @@ data Cond #endif deriving Eq -- to make an assertion work +condUnsigned GU = True +condUnsigned LU = True +condUnsigned GEU = True +condUnsigned LEU = True +condUnsigned _ = False + +condToSigned GU = GTT +condToSigned LU = LTT +condToSigned GEU = GE +condToSigned LEU = LE +condToSigned x = x + +condToUnsigned GTT = GU +condToUnsigned LTT = LU +condToUnsigned GE = GEU +condToUnsigned LE = LEU +condToUnsigned x = x -- ----------------------------------------------------------------------------- -- Sizes on this architecture @@ -129,7 +143,7 @@ data Cond -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes -- here. I've removed them from the x86 version, we'll see what happens --SDM -#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH +#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH data Size #if alpha_TARGET_ARCH = B -- byte @@ -363,7 +377,7 @@ bit or 64 bit precision. --SDM 1/2003 -} -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- data Instr continues... @@ -371,6 +385,9 @@ bit or 64 bit precision. | MOV MachRep Operand Operand | MOVZxL MachRep Operand Operand -- size is the size of operand 1 | MOVSxL MachRep Operand Operand -- size is the size of operand 1 + -- x86_64 note: plain mov into a 32-bit register always zero-extends + -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which + -- don't affect the high bits of the register. -- Load effective address (also a very useful three-operand add instruction :-) | LEA MachRep Operand Operand @@ -379,9 +396,9 @@ bit or 64 bit precision. | ADD MachRep Operand Operand | ADC MachRep Operand Operand | SUB MachRep Operand Operand - | IMUL MachRep Operand Operand -- signed int mul - | MUL MachRep Operand Operand -- unsigned int mul + | MUL MachRep Operand Operand + | IMUL MachRep Operand Operand -- signed int mul | IMUL64 Reg Reg -- operand1:operand2 := (operand1[31:0] *signed operand2[31:0]) @@ -403,6 +420,7 @@ bit or 64 bit precision. | BT MachRep Imm Operand | NOP +#if i386_TARGET_ARCH -- Float Arithmetic. -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles @@ -442,6 +460,32 @@ bit or 64 bit precision. | GTAN MachRep Reg Reg -- src, dst | GFREE -- do ffree on all x86 regs; an ugly hack +#endif + +#if x86_64_TARGET_ARCH +-- SSE2 floating point: we use a restricted set of the available SSE2 +-- instructions for floating-point. + + -- use MOV for moving (either movss or movsd (movlpd better?)) + + | CVTSS2SD Reg Reg -- F32 to F64 + | CVTSD2SS Reg Reg -- F64 to F32 + | CVTSS2SI Operand Reg -- F32 to I32/I64 (with rounding) + | CVTSD2SI Operand Reg -- F64 to I32/I64 (with rounding) + | CVTSI2SS Operand Reg -- I32/I64 to F32 + | CVTSI2SD Operand Reg -- I32/I64 to F64 + + -- use ADD & SUB for arithmetic. In both cases, operands + -- are Operand Reg. 
+ + -- SSE2 floating-point division: + | FDIV MachRep Operand Operand -- divisor, dividend(dst) + + -- use CMP for comparisons. ucomiss and ucomisd instructions + -- compare single/double prec floating point respectively. + + | SQRT MachRep Operand Reg -- src, dst +#endif -- Comparison | TEST MachRep Operand Operand @@ -462,7 +506,7 @@ bit or 64 bit precision. | CALL (Either Imm Reg) -- Other things. - | CLTD -- sign extend %eax into %edx:%eax + | CLTD MachRep -- sign extend %eax into %edx:%eax | FETCHGOT Reg -- pseudo-insn for position-independent code -- pretty-prints as @@ -475,7 +519,9 @@ data Operand | OpImm Imm -- immediate value | OpAddr AddrMode -- memory reference +#endif /* i386 or x86_64 */ +#if i386_TARGET_ARCH i386_insert_ffrees :: [Instr] -> [Instr] i386_insert_ffrees insns | any is_G_instr insns @@ -506,7 +552,6 @@ is_G_instr instr GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True GFREE -> panic "is_G_instr: GFREE (!)" other -> False - #endif /* i386_TARGET_ARCH */ @@ -670,33 +715,4 @@ fPair other = pprPanic "fPair(sparc NCG)" (ppr other) | FETCHPC Reg -- pseudo-instruction: -- bcl to next insn, mflr reg -condUnsigned GU = True -condUnsigned LU = True -condUnsigned GEU = True -condUnsigned LEU = True -condUnsigned _ = False - -condToSigned GU = GTT -condToSigned LU = LTT -condToSigned GEU = GE -condToSigned LEU = LE -condToSigned x = x #endif /* powerpc_TARGET_ARCH */ - - --- ----------------------------------------------------------------------------- --- DestInfo - --- ToDo: might not be needed anymore --SDM - --- used by insnFuture in RegAllocInfo.lhs -data DestInfo - = NoDestInfo -- no supplied dests; infer from context - | DestInfo [CLabel] -- precisely these dests and no others - -hasDestInfo NoDestInfo = False -hasDestInfo (DestInfo _) = True - -pprDests :: DestInfo -> SDoc -pprDests NoDestInfo = text "NoDestInfo" -pprDests (DestInfo dsts) = brackets (hsep (map pprCLabel dsts)) diff --git a/ghc/compiler/nativeGen/MachRegs.lhs b/ghc/compiler/nativeGen/MachRegs.lhs index a3946a7..44448f6 100644 --- a/ghc/compiler/nativeGen/MachRegs.lhs +++ b/ghc/compiler/nativeGen/MachRegs.lhs @@ -51,6 +51,15 @@ module MachRegs ( fake0, fake1, fake2, fake3, fake4, fake5, addrModeRegs, #endif +#if x86_64_TARGET_ARCH + rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp, + eax, ebx, ecx, edx, esi, edi, ebp, esp, + r8, r9, r10, r11, r12, r13, r14, r15, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + xmm, eax, edx, + addrModeRegs, allFPArgRegs, +#endif #if sparc_TARGET_ARCH fits13Bits, fpRel, gReg, iReg, lReg, oReg, largeOffsetError, @@ -141,7 +150,7 @@ data AddrMode | AddrRegImm Reg Imm #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH = AddrBaseIndex Base Index Displacement | ImmAddr Imm Int @@ -160,7 +169,7 @@ type Displacement = Imm | AddrRegImm Reg Imm #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH addrModeRegs :: AddrMode -> [Reg] addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs where @@ -177,7 +186,7 @@ addrOffset addr off #if alpha_TARGET_ARCH _ -> panic "MachMisc.addrOffset not defined for Alpha" #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH ImmAddr i off0 -> Just (ImmAddr i (off0 + off)) AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off))) @@ -280,8 +289,10 @@ spRel :: Int -- desired stack offset in words, positive or negative -> AddrMode spRel n -#if i386_TARGET_ARCH +#if defined(i386_TARGET_ARCH) = AddrBaseIndex (Just 
esp) Nothing (ImmInt (n * wORD_SIZE))
+#elif defined(x86_64_TARGET_ARCH)
+      = AddrBaseIndex (Just rsp) Nothing (ImmInt (n * wORD_SIZE))
 #else
       = AddrRegImm sp (ImmInt (n * wORD_SIZE))
 #endif
@@ -497,6 +508,88 @@ showReg n
 #endif

 {-
+AMD x86_64 architecture:
+- All 16 integer registers have 32-bit counterparts (eax, r8d etc.)
+- All 16 integer registers have 16-bit counterparts (ax, r8w etc.)
+- All 16 integer registers have 8-bit counterparts (al, sil, r8b etc.)
+
+-}

+#if x86_64_TARGET_ARCH

+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+  r8, r9, r10, r11, r12, r13, r14, r15,
+  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+  xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg

+rax   = RealReg 0
+rbx   = RealReg 1
+rcx   = RealReg 2
+rdx   = RealReg 3
+rsi   = RealReg 4
+rdi   = RealReg 5
+rbp   = RealReg 6
+rsp   = RealReg 7
+r8    = RealReg 8
+r9    = RealReg 9
+r10   = RealReg 10
+r11   = RealReg 11
+r12   = RealReg 12
+r13   = RealReg 13
+r14   = RealReg 14
+r15   = RealReg 15
+xmm0  = RealReg 16
+xmm1  = RealReg 17
+xmm2  = RealReg 18
+xmm3  = RealReg 19
+xmm4  = RealReg 20
+xmm5  = RealReg 21
+xmm6  = RealReg 22
+xmm7  = RealReg 23
+xmm8  = RealReg 24
+xmm9  = RealReg 25
+xmm10 = RealReg 26
+xmm11 = RealReg 27
+xmm12 = RealReg 28
+xmm13 = RealReg 29
+xmm14 = RealReg 30
+xmm15 = RealReg 31

+ -- so we can re-use some x86 code:
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp

+xmm n = RealReg (16+n)

+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+regClass (RealReg i)      = if i < 16 then RcInteger else RcDouble
+regClass (VirtualRegI  u) = RcInteger
+regClass (VirtualRegHi u) = RcInteger
+regClass (VirtualRegD  u) = RcDouble
+regClass (VirtualRegF  u) = pprPanic "regClass(x86_64):VirtualRegF"
+                                     (ppr (VirtualRegF u))

+regNames
+ = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ]

+showReg :: RegNo -> String
+showReg n
+  | n >= 16   = "%xmm" ++ show (n-16)
+  | n >= 8    = "%r" ++ show n
+  | otherwise = regNames !! n

+#endif

+{-
 The SPARC has 64 registers of interest; 32 integer registers and 32
 floating point registers.  The mapping of STG registers to SPARC
 machine registers is defined in StgRegs.h.  We are, of course,
@@ -647,6 +740,42 @@ names in the header files.  Gag me with a spoon, eh?
 #define fake4 12
 #define fake5 13
 #endif
+
+#if x86_64_TARGET_ARCH
+#define rax   0
+#define rbx   1
+#define rcx   2
+#define rdx   3
+#define rsi   4
+#define rdi   5
+#define rbp   6
+#define rsp   7
+#define r8    8
+#define r9    9
+#define r10   10
+#define r11   11
+#define r12   12
+#define r13   13
+#define r14   14
+#define r15   15
+#define xmm0  16
+#define xmm1  17
+#define xmm2  18
+#define xmm3  19
+#define xmm4  20
+#define xmm5  21
+#define xmm6  22
+#define xmm7  23
+#define xmm8  24
+#define xmm9  25
+#define xmm10 26
+#define xmm11 27
+#define xmm12 28
+#define xmm13 29
+#define xmm14 30
+#define xmm15 31
+#endif
+
 #if sparc_TARGET_ARCH
 #define g0 0
 #define g1 1
@@ -824,11 +953,12 @@ allMachRegNos :: [RegNo]
 allMachRegNos
   = IF_ARCH_alpha( [0..63],
     IF_ARCH_i386(  [0..13],
+    IF_ARCH_x86_64( [0..31],
     IF_ARCH_sparc( ([0..31]
                     ++ [f0,f2 .. nCG_FirstFloatReg-1]
                     ++ [nCG_FirstFloatReg .. f31]),
     IF_ARCH_powerpc([0..63],
-    ))))
+    )))))

-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
-- i.e., these are the regs for which we are prepared to allow the @@ -854,6 +984,11 @@ callClobberedRegs -- caller-saves registers map RealReg [eax,ecx,edx,fake0,fake1,fake2,fake3,fake4,fake5] #endif /* i386_TARGET_ARCH */ +#if x86_64_TARGET_ARCH + -- caller-saves registers + map RealReg ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..31]) + -- all xmm regs are caller-saves +#endif /* x86_64_TARGET_ARCH */ #if sparc_TARGET_ARCH map RealReg ( oReg 7 : @@ -880,6 +1015,10 @@ argRegs :: RegNo -> [Reg] argRegs _ = panic "MachRegs.argRegs(x86): should not be used!" #endif +#if x86_64_TARGET_ARCH +argRegs _ = panic "MachRegs.argRegs(x86_64): should not be used!" +#endif + #if alpha_TARGET_ARCH argRegs 0 = [] argRegs 1 = freeMappedRegs [16, fReg 16] @@ -932,6 +1071,13 @@ allArgRegs :: [Reg] allArgRegs = panic "MachRegs.allArgRegs(x86): should not be used!" #endif +#if x86_64_TARGET_ARCH +allArgRegs :: [Reg] +allArgRegs = map RealReg [rdi,rsi,rdx,rcx,r8,r9] +allFPArgRegs :: [Reg] +allFPArgRegs = map RealReg [xmm0 .. xmm7] +#endif + #if powerpc_TARGET_ARCH allArgRegs :: [Reg] allArgRegs = map RealReg [3..10] @@ -960,6 +1106,10 @@ freeReg 63 = fastBool False -- always zero (f31) freeReg esp = fastBool False -- %esp is the C stack pointer #endif +#if x86_64_TARGET_ARCH +freeReg rsp = fastBool False -- %rsp is the C stack pointer +#endif + #if sparc_TARGET_ARCH freeReg g0 = fastBool False -- %g0 is always 0. freeReg g5 = fastBool False -- %g5 is reserved (ABI). diff --git a/ghc/compiler/nativeGen/NCG.h b/ghc/compiler/nativeGen/NCG.h index 78db0c9..b17f682 100644 --- a/ghc/compiler/nativeGen/NCG.h +++ b/ghc/compiler/nativeGen/NCG.h @@ -26,6 +26,12 @@ # define IF_ARCH_i386(x,y) y #endif -- - - - - - - - - - - - - - - - - - - - - - +#if x86_64_TARGET_ARCH +# define IF_ARCH_x86_64(x,y) x +#else +# define IF_ARCH_x86_64(x,y) y +#endif +-- - - - - - - - - - - - - - - - - - - - - - #if freebsd_TARGET_OS # define IF_OS_freebsd(x,y) x #else diff --git a/ghc/compiler/nativeGen/PprMach.hs b/ghc/compiler/nativeGen/PprMach.hs index 26b192f..197a82a 100644 --- a/ghc/compiler/nativeGen/PprMach.hs +++ b/ghc/compiler/nativeGen/PprMach.hs @@ -2,8 +2,8 @@ -- -- Pretty-printing assembly language -- - -- (c) The University of Glasgow 1993-2004 - -- +-- (c) The University of Glasgow 1993-2005 +-- ----------------------------------------------------------------------------- -- We start with the @pprXXX@s with some cross-platform commonality @@ -21,7 +21,7 @@ module PprMach ( #include "HsVersions.h" import Cmm -import MachOp ( MachRep(..) ) +import MachOp ( MachRep(..), wordRep, isFloatingRep ) import MachRegs -- may differ per-platform import MachInstrs @@ -115,13 +115,13 @@ pprBasicBlock (BasicBlock (BlockId id) instrs) = -- on which bit of it we care about. Yurgh. 
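[Editorial aside] What "which bit of it we care about" means concretely: a single x86_64 register number names four assembler registers, one per operand width, which is why pprReg below takes the MachRep as an extra argument (and why pprUserReg fixes it at the word size). A toy sketch covering just two registers; widthName is an illustrative helper, not the patch's:

widthName :: Int -> Int -> String    -- operand width in bits, register number
widthName w 0 = case w of { 8 -> "%al";  16 -> "%ax";  32 -> "%eax"; _ -> "%rax" }
widthName w 8 = case w of { 8 -> "%r8b"; 16 -> "%r8w"; 32 -> "%r8d"; _ -> "%r8" }
widthName _ n = "%reg" ++ show n     -- the real tables below cover all 32 registers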
pprUserReg :: Reg -> Doc
-pprUserReg = pprReg IF_ARCH_i386(I32,)
+pprUserReg = pprReg IF_ARCH_i386(I32,) IF_ARCH_x86_64(I64,)

-pprReg :: IF_ARCH_i386(MachRep ->,) Reg -> Doc
+pprReg :: IF_ARCH_i386(MachRep ->, IF_ARCH_x86_64(MachRep ->,)) Reg -> Doc

-pprReg IF_ARCH_i386(s,) r
+pprReg IF_ARCH_i386(s, IF_ARCH_x86_64(s,)) r
   = case r of
-      RealReg i      -> ppr_reg_no IF_ARCH_i386(s,) i
+      RealReg i      -> ppr_reg_no IF_ARCH_i386(s, IF_ARCH_x86_64(s,)) i
       VirtualRegI  u  -> text "%vI_"  <> asmSDoc (pprUnique u)
       VirtualRegHi u  -> text "%vHi_" <> asmSDoc (pprUnique u)
       VirtualRegF  u  -> text "%vF_"  <> asmSDoc (pprUnique u)
@@ -200,6 +200,74 @@ pprReg IF_ARCH_i386(s,) r
 	_  -> SLIT("very naughty I386 register")
       })
 #endif
+
+#if x86_64_TARGET_ARCH
+    ppr_reg_no :: MachRep -> Int -> Doc
+    ppr_reg_no I8   = ppr_reg_byte
+    ppr_reg_no I16  = ppr_reg_word
+    ppr_reg_no I32  = ppr_reg_long
+    ppr_reg_no _    = ppr_reg_quad
+
+    ppr_reg_byte i = ptext
+      (case i of {
+	 0 -> SLIT("%al");     1 -> SLIT("%bl");
+	 2 -> SLIT("%cl");     3 -> SLIT("%dl");
+	 4 -> SLIT("%sil");    5 -> SLIT("%dil"); -- new 8-bit regs!
+	 6 -> SLIT("%bpl");    7 -> SLIT("%spl");
+	 8 -> SLIT("%r8b");    9 -> SLIT("%r9b");
+	10 -> SLIT("%r10b");  11 -> SLIT("%r11b");
+	12 -> SLIT("%r12b");  13 -> SLIT("%r13b");
+	14 -> SLIT("%r14b");  15 -> SLIT("%r15b");
+	_  -> SLIT("very naughty x86_64 byte register")
+      })
+
+    ppr_reg_word i = ptext
+      (case i of {
+	 0 -> SLIT("%ax");     1 -> SLIT("%bx");
+	 2 -> SLIT("%cx");     3 -> SLIT("%dx");
+	 4 -> SLIT("%si");     5 -> SLIT("%di");
+	 6 -> SLIT("%bp");     7 -> SLIT("%sp");
+	 8 -> SLIT("%r8w");    9 -> SLIT("%r9w");
+	10 -> SLIT("%r10w");  11 -> SLIT("%r11w");
+	12 -> SLIT("%r12w");  13 -> SLIT("%r13w");
+	14 -> SLIT("%r14w");  15 -> SLIT("%r15w");
+	_  -> SLIT("very naughty x86_64 word register")
+      })
+
+    ppr_reg_long i = ptext
+      (case i of {
+	 0 -> SLIT("%eax");    1 -> SLIT("%ebx");
+	 2 -> SLIT("%ecx");    3 -> SLIT("%edx");
+	 4 -> SLIT("%esi");    5 -> SLIT("%edi");
+	 6 -> SLIT("%ebp");    7 -> SLIT("%esp");
+	 8 -> SLIT("%r8d");    9 -> SLIT("%r9d");
+	10 -> SLIT("%r10d");  11 -> SLIT("%r11d");
+	12 -> SLIT("%r12d");  13 -> SLIT("%r13d");
+	14 -> SLIT("%r14d");  15 -> SLIT("%r15d");
+	_  -> SLIT("very naughty x86_64 register")
+      })
+
+    ppr_reg_quad i = ptext
+      (case i of {
+	 0 -> SLIT("%rax");    1 -> SLIT("%rbx");
+	 2 -> SLIT("%rcx");    3 -> SLIT("%rdx");
+	 4 -> SLIT("%rsi");    5 -> SLIT("%rdi");
+	 6 -> SLIT("%rbp");    7 -> SLIT("%rsp");
+	 8 -> SLIT("%r8");     9 -> SLIT("%r9");
+	10 -> SLIT("%r10");   11 -> SLIT("%r11");
+	12 -> SLIT("%r12");   13 -> SLIT("%r13");
+	14 -> SLIT("%r14");   15 -> SLIT("%r15");
+	16 -> SLIT("%xmm0");  17 -> SLIT("%xmm1");
+	18 -> SLIT("%xmm2");  19 -> SLIT("%xmm3");
+	20 -> SLIT("%xmm4");  21 -> SLIT("%xmm5");
+	22 -> SLIT("%xmm6");  23 -> SLIT("%xmm7");
+	24 -> SLIT("%xmm8");  25 -> SLIT("%xmm9");
+	26 -> SLIT("%xmm10"); 27 -> SLIT("%xmm11");
+	28 -> SLIT("%xmm12"); 29 -> SLIT("%xmm13");
+	30 -> SLIT("%xmm14"); 31 -> SLIT("%xmm15")
+      })
+#endif
+
 #if sparc_TARGET_ARCH
     ppr_reg_no :: Int -> Doc
     ppr_reg_no i = ptext
@@ -290,7 +358,7 @@ pprReg IF_ARCH_i386(s,) r

-- -----------------------------------------------------------------------------
-- pprSize: print a 'Size'

-#if powerpc_TARGET_ARCH || i386_TARGET_ARCH
+#if powerpc_TARGET_ARCH || i386_TARGET_ARCH || x86_64_TARGET_ARCH
pprSize :: MachRep -> Doc
#else
pprSize :: Size -> Doc
@@ -310,14 +378,21 @@ pprSize x = ptext (case x of
--	SF  -> SLIT("s")    UNUSED
	TF  -> SLIT("t")
#endif
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
	I8   -> SLIT("b")
	I16  -> SLIT("w")
	I32  -> SLIT("l")
-	F32  -> SLIT("s")
-	F64  -> SLIT("l")
SLIT("l") + I64 -> SLIT("q") +#endif +#if i386_TARGET_ARCH + F32 -> SLIT("l") + F64 -> SLIT("q") F80 -> SLIT("t") #endif +#if x86_64_TARGET_ARCH + F32 -> SLIT("ss") -- "scalar single-precision float" (SSE2) + F64 -> SLIT("sd") -- "scalar double-precision float" (SSE2) +#endif #if sparc_TARGET_ARCH B -> SLIT("sb") Bu -> SLIT("ub") @@ -362,9 +437,9 @@ pprCond c = ptext (case c of { GTT -> SLIT("gt"); GE -> SLIT("ge") #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH GEU -> SLIT("ae"); LU -> SLIT("b"); - EQQ -> SLIT("e"); GTT -> SLIT("g"); + EQQ -> SLIT("e"); GTT -> SLIT("g"); GE -> SLIT("ge"); GU -> SLIT("a"); LTT -> SLIT("l"); LE -> SLIT("le"); LEU -> SLIT("be"); NE -> SLIT("ne"); @@ -466,7 +541,7 @@ pprAddr (AddrRegImm r1 i) ------------------- -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH pprAddr (ImmAddr imm off) = let pp_imm = pprImm imm in @@ -481,7 +556,7 @@ pprAddr (AddrBaseIndex base index displacement) = let pp_disp = ppr_disp displacement pp_off p = pp_disp <> char '(' <> p <> char ')' - pp_reg r = pprReg I32 r + pp_reg r = pprReg wordRep r in case (base,index) of (Nothing, Nothing) -> pp_disp @@ -540,39 +615,59 @@ pprSectionHeader Text IF_ARCH_alpha(SLIT("\t.text\n\t.align 3") {-word boundary-} ,IF_ARCH_sparc(SLIT(".text\n\t.align 4") {-word boundary-} ,IF_ARCH_i386(SLIT(".text\n\t.align 4,0x90") {-needs per-OS variation!-} + ,IF_ARCH_x86_64(SLIT(".text\n\t.align 8") {-needs per-OS variation!-} ,IF_ARCH_powerpc(SLIT(".text\n.align 2") - ,)))) + ,))))) pprSectionHeader Data = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} ,IF_ARCH_i386(SLIT(".data\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".data\n\t.align 8") ,IF_ARCH_powerpc(SLIT(".data\n.align 2") - ,)))) + ,))))) pprSectionHeader ReadOnlyData = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".section .rodata\n\t.align 8") ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const\n.align 2"), SLIT(".section .rodata\n\t.align 2")) - ,)))) + ,))))) pprSectionHeader RelocatableReadOnlyData = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} - ,IF_ARCH_i386(SLIT(".data\n\t.align 4") + ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".text\n\t.align 8") ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const_data\n.align 2"), SLIT(".data\n\t.align 2")) - ,)))) + ,))))) + -- the assembler on x86_64/Linux refuses to generate code for + -- .quad x - y + -- where x is in the text section and y in the rodata section. + -- It works if y is in the text section, though. This is probably + -- going to cause difficulties for PIC, I imagine. 
 pprSectionHeader UninitialisedData
   = ptext
 	IF_ARCH_alpha(SLIT("\t.bss\n\t.align 3")
       ,IF_ARCH_sparc(SLIT(".bss\n\t.align 8") {-<8 will break double constants -}
       ,IF_ARCH_i386(SLIT(".section .bss\n\t.align 4")
+      ,IF_ARCH_x86_64(SLIT(".section .bss\n\t.align 8")
       ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const_data\n.align 2"),
                                     SLIT(".section .bss\n\t.align 2"))
-      ,))))
+      ,)))))
+pprSectionHeader ReadOnlyData16
+  = ptext
+	IF_ARCH_alpha(SLIT("\t.data\n\t.align 4")
+      ,IF_ARCH_sparc(SLIT(".data\n\t.align 16")
+      ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 16")
+      ,IF_ARCH_x86_64(SLIT(".section .rodata.cst16\n\t.align 16")
+      ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const\n.align 4"),
+                                    SLIT(".section .rodata\n\t.align 4"))
+      ,)))))
+
 pprSectionHeader (OtherSection sec)
   = panic "PprMach.pprSectionHeader: unknown section"
@@ -586,11 +681,8 @@ pprData (CmmStaticLit lit)       = pprDataItem lit
 
 pprGloblDecl :: CLabel -> Doc
 pprGloblDecl lbl
   | not (externallyVisibleCLabel lbl) = empty
-  | otherwise = ptext IF_ARCH_alpha(SLIT(".globl ")
-			,IF_ARCH_i386(SLIT(".globl ")
-			,IF_ARCH_sparc(SLIT(".global ")
-			,IF_ARCH_powerpc(SLIT(".globl ")
-			,)))) <>
+  | otherwise = ptext IF_ARCH_sparc(SLIT(".global "),
+				    SLIT(".globl ")) <>
 		pprCLabel_asm lbl
 
 pprLabel :: CLabel -> Doc
@@ -612,8 +704,9 @@ pprASCII str
 
 pprAlign bytes =
 	IF_ARCH_alpha(ptext SLIT(".align ") <> int pow2,
 	IF_ARCH_i386(ptext SLIT(".align ") <> int bytes,
+	IF_ARCH_x86_64(ptext SLIT(".align ") <> int bytes,
 	IF_ARCH_sparc(ptext SLIT(".align ") <> int bytes,
-	IF_ARCH_powerpc(ptext SLIT(".align ") <> int pow2,))))
+	IF_ARCH_powerpc(ptext SLIT(".align ") <> int pow2,)))))
   where
 	pow2 = log2 bytes
@@ -646,7 +739,7 @@ pprDataItem lit
 	ppr_item I16  x = [ptext SLIT("\t.short\t") <> pprImm imm]
 	ppr_item I64  x = [ptext SLIT("\t.quad\t") <> pprImm imm]
 #endif
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	ppr_item I16  x = [ptext SLIT("\t.word\t") <> pprImm imm]
 	ppr_item I64  x = [ptext SLIT("\t.quad\t") <> pprImm imm]
 #endif
@@ -672,10 +765,11 @@ pprInstr (COMMENT s)
    = IF_ARCH_alpha( ((<>) (ptext SLIT("\t# ")) (ftext s))
     ,IF_ARCH_sparc( ((<>) (ptext SLIT("! "))   (ftext s))
     ,IF_ARCH_i386( ((<>) (ptext SLIT("# "))   (ftext s))
+    ,IF_ARCH_x86_64( ((<>) (ptext SLIT("# "))   (ftext s))
     ,IF_ARCH_powerpc( IF_OS_linux(
        ((<>) (ptext SLIT("# ")) (ftext s)),
        ((<>) (ptext SLIT("; ")) (ftext s)))
-    ,))))
+    ,)))))
 
 pprInstr (DELTA d)
    = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
@@ -1071,7 +1165,7 @@ pprSizeRegRegReg name size reg1 reg2 reg3
 -- -----------------------------------------------------------------------------
 -- pprInstr for an x86
 
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 pprInstr v@(MOV size s@(OpReg src) d@(OpReg dst)) -- hack
   | src == dst
@@ -1081,10 +1175,18 @@ pprInstr v@(MOV size s@(OpReg src) d@(OpReg dst)) -- hack
 #else
 	empty
 #endif
+
 pprInstr (MOV size src dst)
   = pprSizeOpOp SLIT("mov") size src dst
-pprInstr (MOVZxL sizes src dst) = pprSizeOpOpCoerce SLIT("movz") sizes I32 src dst
-pprInstr (MOVSxL sizes src dst) = pprSizeOpOpCoerce SLIT("movs") sizes I32 src dst
+
+pprInstr (MOVZxL I32 src dst) = pprSizeOpOp SLIT("mov") I32 src dst
+	-- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
+	-- movl.  But we represent it as a MOVZxL instruction, because
+	-- the reg alloc would tend to throw away a plain reg-to-reg
+	-- move, and we still want it to do that.
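To see why the zero-extending movl must not look like a plain move, compare it against the register allocator's move test (a sketch simplified from isRegRegMove in RegAllocInfo.hs, further down in this patch; the catch-all equation is added here for illustration):

    -- A MOV between two registers is a coalescing candidate, so a
    -- semantically significant "movl %eax,%eax" emitted as MOV could be
    -- discarded when src and dst end up in the same real register ...
    isRegRegMove (MOV _ (OpReg r1) (OpReg r2)) = Just (r1,r2)
    -- ... whereas MOVZxL never matches, so the instruction survives and
    -- the upper 32 bits are still cleared.
    isRegRegMove _ = Nothing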
+
+pprInstr (MOVZxL sizes src dst) = pprSizeOpOpCoerce SLIT("movz") sizes wordRep src dst
+pprInstr (MOVSxL sizes src dst) = pprSizeOpOpCoerce SLIT("movs") sizes wordRep src dst
 
 -- here we do some patching, since the physical registers are only set late
 -- in the code generation.
@@ -1117,11 +1219,13 @@ pprInstr (IMUL size op1 op2) = pprSizeOpOp SLIT("imul") size op1 op2
    however, cannot be used to determine if the upper half of the result
    is non-zero."  So there.
 -}
-pprInstr (MUL size op1 op2) = pprSizeOpOp SLIT("imul") size op1 op2
-
 pprInstr (AND size src dst) = pprSizeOpOp SLIT("and") size src dst
 pprInstr (OR  size src dst) = pprSizeOpOp SLIT("or")  size src dst
+
+pprInstr (XOR F32 src dst)  = pprOpOp SLIT("xorps") F32 src dst
+pprInstr (XOR F64 src dst)  = pprOpOp SLIT("xorpd") F64 src dst
 pprInstr (XOR size src dst) = pprSizeOpOp SLIT("xor")  size src dst
+
 pprInstr (NOT size op) = pprSizeOp SLIT("not") size op
 pprInstr (NEGI size op) = pprSizeOp SLIT("neg") size op
 
@@ -1131,7 +1235,10 @@ pprInstr (SHR size src dst) = pprShift SLIT("shr") size src dst
 
 pprInstr (BT  size imm src) = pprSizeImmOp SLIT("bt") size imm src
 
-pprInstr (CMP size src dst) = pprSizeOpOp SLIT("cmp")  size src dst
+pprInstr (CMP size src dst)
+  | isFloatingRep size = pprSizeOpOp SLIT("ucomi") size src dst -- SSE2
+  | otherwise          = pprSizeOpOp SLIT("cmp")   size src dst
+
 pprInstr (TEST size src dst) = pprSizeOpOp SLIT("test")  size src dst
 pprInstr (PUSH size op) = pprSizeOp SLIT("push") size op
 pprInstr (POP size op) = pprSizeOp SLIT("pop") size op
@@ -1141,7 +1248,8 @@ pprInstr (POP size op) = pprSizeOp SLIT("pop") size op
 -- pprInstr POPA = ptext SLIT("\tpopal")
 
 pprInstr NOP = ptext SLIT("\tnop")
-pprInstr CLTD = ptext SLIT("\tcltd")
+pprInstr (CLTD I32) = ptext SLIT("\tcltd")
+pprInstr (CLTD I64) = ptext SLIT("\tcqto")
 
 pprInstr (SETCC cond op) = pprCondInstr SLIT("set") cond (pprOperand I8 op)
 
@@ -1150,17 +1258,42 @@ pprInstr (JXX cond (BlockId id))
   where lab = mkAsmTempLabel id
 
 pprInstr (JMP (OpImm imm)) = (<>) (ptext SLIT("\tjmp ")) (pprImm imm)
-pprInstr (JMP op) = (<>) (ptext SLIT("\tjmp *")) (pprOperand I32 op)
+pprInstr (JMP op) = (<>) (ptext SLIT("\tjmp *")) (pprOperand wordRep op)
 pprInstr (JMP_TBL op ids) = pprInstr (JMP op)
 
 pprInstr (CALL (Left imm)) = (<>) (ptext SLIT("\tcall ")) (pprImm imm)
-pprInstr (CALL (Right reg)) = (<>) (ptext SLIT("\tcall *")) (pprReg I32 reg)
+pprInstr (CALL (Right reg)) = (<>) (ptext SLIT("\tcall *")) (pprReg wordRep reg)
 
 pprInstr (IDIV sz op) = pprSizeOp SLIT("idiv") sz op
 pprInstr (DIV sz op)  = pprSizeOp SLIT("div") sz op
 pprInstr (IMUL64 sd_hi sd_lo) = pprInstr_imul64 sd_hi sd_lo
 
+#if x86_64_TARGET_ARCH
+pprInstr (MUL size op1 op2) = pprSizeOpOp SLIT("mul") size op1 op2
+
+pprInstr (FDIV size op1 op2) = pprSizeOpOp SLIT("div") size op1 op2
+
+pprInstr (CVTSS2SD from to) = pprRegReg SLIT("cvtss2sd") from to
+pprInstr (CVTSD2SS from to) = pprRegReg SLIT("cvtsd2ss") from to
+pprInstr (CVTSS2SI from to) = pprOpReg SLIT("cvtss2si") from to
+pprInstr (CVTSD2SI from to) = pprOpReg SLIT("cvtsd2si") from to
+pprInstr (CVTSI2SS from to) = pprOpReg SLIT("cvtsi2ss") from to
+pprInstr (CVTSI2SD from to) = pprOpReg SLIT("cvtsi2sd") from to
+#endif
+
+pprInstr (FETCHGOT reg)
+   = vcat [ ptext SLIT("\tcall 1f"),
+            hcat [ ptext SLIT("1:\tpopl\t"), pprReg I32 reg ],
+            hcat [ ptext SLIT("\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), "),
+                   pprReg I32 reg ]
+          ]
+#endif
+
+-- -----------------------------------------------------------------------------
+-- i386 floating-point
+
+#if i386_TARGET_ARCH
 -- Simulating a flat register set on the x86 FP stack is tricky.
 -- you have to free %st(7) before pushing anything on the FP reg stack
 -- so as to preclude the possibility of a FP stack overflow exception.
@@ -1357,31 +1490,6 @@ pprInstr GFREE
 	   ptext SLIT("\tffree %st(4) ;ffree %st(5) ;ffree %st(6) ;ffree %st(7)")
         ]
 
-pprInstr (FETCHGOT reg)
-   = vcat [ ptext SLIT("\tcall 1f"),
-            hcat [ ptext SLIT("1:\tpopl\t"), pprReg I32 reg ],
-            hcat [ ptext SLIT("\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), "),
-                   pprReg I32 reg ]
-          ]
-
--- Emit code to make hi_reg:lo_reg be the 64-bit product of hi_reg and lo_reg
-pprInstr_imul64 hi_reg lo_reg
-   = let fakeInsn = text "imul64" <+> pp_hi_reg <> comma <+> pp_lo_reg
-         pp_hi_reg = pprReg I32 hi_reg
-         pp_lo_reg = pprReg I32 lo_reg
-     in
-     vcat [
-        text "\t# BEGIN " <> fakeInsn,
-        text "\tpushl" <+> pp_hi_reg <> text" ; pushl" <+> pp_lo_reg,
-        text "\tpushl %eax ; pushl %edx",
-        text "\tmovl 12(%esp), %eax ; imull 8(%esp)",
-        text "\tmovl %edx, 12(%esp) ; movl %eax, 8(%esp)",
-        text "\tpopl %edx ; popl %eax",
-        text "\tpopl" <+> pp_lo_reg <> text " ; popl" <+> pp_hi_reg,
-        text "\t# END " <> fakeInsn
-     ]
-
-
 --------------------------
 -- coerce %st(0) to the specified size
@@ -1431,7 +1539,26 @@ pprGInstr (GADD sz src1 src2 dst) = pprSizeRegRegReg SLIT("gadd") sz src1 src2 dst
 pprGInstr (GSUB sz src1 src2 dst) = pprSizeRegRegReg SLIT("gsub") sz src1 src2 dst
 pprGInstr (GMUL sz src1 src2 dst) = pprSizeRegRegReg SLIT("gmul") sz src1 src2 dst
 pprGInstr (GDIV sz src1 src2 dst) = pprSizeRegRegReg SLIT("gdiv") sz src1 src2 dst
+#endif
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
+-- Emit code to make hi_reg:lo_reg be the 64-bit product of hi_reg and lo_reg
+pprInstr_imul64 hi_reg lo_reg
+   = let fakeInsn = text "imul64" <+> pp_hi_reg <> comma <+> pp_lo_reg
+         pp_hi_reg = pprReg wordRep hi_reg
+         pp_lo_reg = pprReg wordRep lo_reg
+     in
+     vcat [
+        text "\t# BEGIN " <> fakeInsn,
+        text "\tpushl" <+> pp_hi_reg <> text" ; pushl" <+> pp_lo_reg,
+        text "\tpushl %eax ; pushl %edx",
+        text "\tmovl 12(%esp), %eax ; imull 8(%esp)",
+        text "\tmovl %edx, 12(%esp) ; movl %eax, 8(%esp)",
+        text "\tpopl %edx ; popl %eax",
+        text "\tpopl" <+> pp_lo_reg <> text " ; popl" <+> pp_hi_reg,
+        text "\t# END " <> fakeInsn
+     ]
 
 -- Continue with I386-only printing bits and bobs:
 
 pprDollImm :: Imm -> Doc
@@ -1443,6 +1570,10 @@ pprOperand s (OpReg r)   = pprReg s r
 pprOperand s (OpImm i)   = pprDollImm i
 pprOperand s (OpAddr ea) = pprAddr ea
 
+pprMnemonic_ :: LitString -> Doc
+pprMnemonic_ name =
+   char '\t' <> ptext name <> space
+
 pprMnemonic :: LitString -> MachRep -> Doc
 pprMnemonic name size =
   char '\t' <> ptext name <> pprSize size <> space
@@ -1473,6 +1604,15 @@ pprSizeOpOp name size op1 op2
 	pprOperand size op2
     ]
 
+pprOpOp :: LitString -> MachRep -> Operand -> Operand -> Doc
+pprOpOp name size op1 op2
+  = hcat [
+	pprMnemonic_ name,
+	pprOperand size op1,
+	comma,
+	pprOperand size op2
+    ]
+
 pprSizeReg :: LitString -> MachRep -> Reg -> Doc
 pprSizeReg name size reg1
   = hcat [
@@ -1489,6 +1629,24 @@ pprSizeRegReg name size reg1 reg2
 	pprReg size reg2
     ]
 
+pprRegReg :: LitString -> Reg -> Reg -> Doc
+pprRegReg name reg1 reg2
+  = hcat [
+	pprMnemonic_ name,
+	pprReg wordRep reg1,
+	comma,
+	pprReg wordRep reg2
+    ]
+
+pprOpReg :: LitString -> Operand -> Reg -> Doc
+pprOpReg name op1 reg2
+  = hcat [
+	pprMnemonic_ name,
+	pprOperand wordRep op1,
+	comma,
+	pprReg wordRep reg2
+    ]
+
 pprCondRegReg :: LitString -> MachRep -> Cond -> Reg -> Reg -> Doc
 pprCondRegReg name size cond reg1 reg2
   = hcat [
diff --git a/ghc/compiler/nativeGen/RegAllocInfo.hs b/ghc/compiler/nativeGen/RegAllocInfo.hs
index 7d1bf48..6b929e5 100644
--- a/ghc/compiler/nativeGen/RegAllocInfo.hs
+++ b/ghc/compiler/nativeGen/RegAllocInfo.hs
@@ -24,7 +24,7 @@ module RegAllocInfo (
 #include "HsVersions.h"
 
 import Cmm		( BlockId )
-#if powerpc_TARGET_ARCH || i386_TARGET_ARCH
+#if powerpc_TARGET_ARCH || i386_TARGET_ARCH || x86_64_TARGET_ARCH
 import MachOp		( MachRep(..) )
 #endif
 import MachInstrs
@@ -138,7 +138,7 @@ regUsage instr = case instr of
 #endif /* alpha_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 regUsage instr = case instr of
     MOV    sz src dst	-> usageRW src dst
@@ -173,9 +173,10 @@ regUsage instr = case instr of
     JMP_TBL op ids	-> mkRU (use_R op) []
     CALL (Left imm)	-> mkRU [] callClobberedRegs
     CALL (Right reg)	-> mkRU [reg] callClobberedRegs
-    CLTD		-> mkRU [eax] [edx]
+    CLTD sz		-> mkRU [eax] [edx]
     NOP			-> mkRU [] []
 
+#if i386_TARGET_ARCH
     GMOV   src dst	-> mkRU [src] [dst]
     GLD    sz src dst	-> mkRU (use_EA src) [dst]
     GST    sz src dst	-> mkRU (src : use_EA dst) []
@@ -201,6 +202,17 @@ regUsage instr = case instr of
     GSIN   sz src dst	-> mkRU [src] [dst]
     GCOS   sz src dst	-> mkRU [src] [dst]
     GTAN   sz src dst	-> mkRU [src] [dst]
+#endif
+
+#if x86_64_TARGET_ARCH
+    CVTSS2SD src dst	-> mkRU [src] [dst]
+    CVTSD2SS src dst	-> mkRU [src] [dst]
+    CVTSS2SI src dst	-> mkRU (use_R src) [dst]
+    CVTSD2SI src dst	-> mkRU (use_R src) [dst]
+    CVTSI2SS src dst	-> mkRU (use_R src) [dst]
+    CVTSI2SD src dst	-> mkRU (use_R src) [dst]
+    FDIV sz src dst	-> usageRM src dst
+#endif
 
     FETCHGOT reg	-> mkRU [] [reg]
 
@@ -244,7 +256,7 @@ regUsage instr = case instr of
     mkRU src dst = RU (filter interesting src)
    		      (filter interesting dst)
 
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 #if sparc_TARGET_ARCH
@@ -370,7 +382,7 @@ regUsage instr = case instr of
 jumpDests :: Instr -> [BlockId] -> [BlockId]
 jumpDests insn acc
   = case insn of
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	JXX _ id	-> id : acc
 	JMP_TBL _ ids	-> ids ++ acc
 #elif powerpc_TARGET_ARCH
@@ -445,7 +457,7 @@ patchRegs instr env = case instr of
 #endif /* alpha_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 patchRegs instr env = case instr of
     MOV  sz src dst	-> patch2 (MOV  sz) src dst
@@ -477,6 +489,7 @@ patchRegs instr env = case instr of
     JMP op		-> patch1 JMP op
     JMP_TBL op ids	-> patch1 JMP_TBL op $ ids
 
+#if i386_TARGET_ARCH
     GMOV src dst	-> GMOV (env src) (env dst)
     GLD  sz src dst	-> GLD sz (lookupAddr src) (env dst)
     GST  sz src dst	-> GST sz (env src) (lookupAddr dst)
@@ -502,6 +515,17 @@ patchRegs instr env = case instr of
     GSIN sz src dst	-> GSIN sz (env src) (env dst)
     GCOS sz src dst	-> GCOS sz (env src) (env dst)
    GTAN sz src dst	-> GTAN sz (env src) (env dst)
+#endif
+
+#if x86_64_TARGET_ARCH
+    CVTSS2SD src dst	-> CVTSS2SD (env src) (env dst)
+    CVTSD2SS src dst	-> CVTSD2SS (env src) (env dst)
+    CVTSS2SI src dst	-> CVTSS2SI (patchOp src) (env dst)
+    CVTSD2SI src dst	-> CVTSD2SI (patchOp src) (env dst)
+    CVTSI2SS src dst	-> CVTSI2SS (patchOp src) (env dst)
+    CVTSI2SD src dst	-> CVTSI2SD (patchOp src) (env dst)
+    FDIV sz src dst	-> FDIV sz (patchOp src) (patchOp dst)
+#endif
 
     CALL (Left imm)	-> instr
     CALL (Right reg)	-> CALL (Right (env reg))
@@ -512,7 +536,7 @@ patchRegs instr env = case instr of
     COMMENT _		-> instr
     DELTA _		-> instr
     JXX _ _		-> instr
-    CLTD		-> instr
+    CLTD _		-> instr
     _other		-> panic "patchRegs: unrecognised instr"
 
@@ -534,7 +558,7 @@ patchRegs instr env = case instr of
     lookupIndex  Nothing      = Nothing
     lookupIndex  (Just (r,i)) = Just (env r, i)
 
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 #if sparc_TARGET_ARCH
@@ -645,7 +669,7 @@ patchRegs instr env = case instr of
 -- by assigning the src and dest temporaries to the same real register.
 
 isRegRegMove :: Instr -> Maybe (Reg,Reg)
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 -- TMP:
 isRegRegMove (MOV _ (OpReg r1) (OpReg r2)) = Just (r1,r2)
 #elif powerpc_TARGET_ARCH
@@ -678,6 +702,12 @@ mkSpillInstr reg delta slot
 	RcInteger -> MOV I32 (OpReg reg) (OpAddr (spRel off_w))
 	_         -> GST F80 reg (spRel off_w) {- RcFloat/RcDouble -}
 #endif
+#ifdef x86_64_TARGET_ARCH
+    let off_w = (off-delta) `div` 8
+    in case regClass reg of
+	RcInteger -> MOV I64 (OpReg reg) (OpAddr (spRel off_w))
+	_         -> panic "mkSpillInstr: ToDo"
+#endif
 #ifdef sparc_TARGET_ARCH
 	{-SPARC: spill below frame pointer leaving 2 words/spill-}
 	let{off_w = 1 + (off `div` 4);
@@ -705,16 +735,22 @@ mkLoadInstr reg delta slot
 	let off     = spillSlotToOffset slot
 	in
-#ifdef alpha_TARGET_ARCH
+#if alpha_TARGET_ARCH
 	LD sz dyn (spRel (- (off `div` 8)))
 #endif
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH
 	let off_w = (off-delta) `div` 4
        in case regClass reg of {
              RcInteger -> MOV I32 (OpAddr (spRel off_w)) (OpReg reg);
             _         -> GLD F80 (spRel off_w) reg} {- RcFloat/RcDouble -}
 #endif
-#ifdef sparc_TARGET_ARCH
+#if x86_64_TARGET_ARCH
+	let off_w = (off-delta) `div` 8
+        in case regClass reg of
+              RcInteger -> MOV I64 (OpAddr (spRel off_w)) (OpReg reg)
+              _         -> panic "mkLoadInstr: ToDo"
+#endif
+#if sparc_TARGET_ARCH
 	let{off_w = 1 + (off `div` 4);
            sz = case regClass vreg of {
                    RcInteger -> W;
@@ -722,7 +758,7 @@ mkLoadInstr reg delta slot
                   RcDouble  -> DF}}
        in LD sz (fpRel (- off_w)) dyn
 #endif
-#ifdef powerpc_TARGET_ARCH
+#if powerpc_TARGET_ARCH
 	let sz = case regClass reg of
                    RcInteger -> I32
                    RcDouble -> F64
-- 
1.7.10.4
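The two "ToDo" panics above leave floating-point spill and reload unimplemented on x86_64. A hedged sketch of what the missing alternatives might look like, mirroring the shape of the i386 GST/GLD cases and assuming a MOV at F64 prints as movsd via the SSE2 suffixes this patch adds to pprSize (not part of the actual patch; spill slots are word-sized, so one F64 move covers both float reps):

    -- hypothetical x86_64 float cases for mkSpillInstr / mkLoadInstr
    _ -> MOV F64 (OpReg reg) (OpAddr (spRel off_w))   -- spill:  movsd reg -> stack
    _ -> MOV F64 (OpAddr (spRel off_w)) (OpReg reg)   -- reload: movsd stack -> reg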