[project @ 2005-05-21 15:39:00 by panne]

[ghc-hetmet.git] / ghc / compiler / nativeGen / MachCodeGen.hs
diff --git a/ghc/compiler/nativeGen/MachCodeGen.hs b/ghc/compiler/nativeGen/MachCodeGen.hs

index 22bd60d..24e8b04 100644 (file)
--- a/ghc/compiler/nativeGen/MachCodeGen.hs
+++ b/ghc/compiler/nativeGen/MachCodeGen.hs
@@ -15,6 +15,7 @@ module MachCodeGen ( cmmTopCodeGen, InstrBlock ) where
  
  #include "HsVersions.h"
  #include "nativeGen/NCG.h"
+#include "MachDeps.h"
  
  -- NCG stuff:
  import MachInstrs
@@ -29,14 +30,14 @@ import MachOp
  import CLabel
  
  -- The rest:
-import CmdLineOpts     ( opt_PIC )
+import StaticFlags     ( opt_PIC )
  import ForeignCall     ( CCallConv(..) )
  import OrdList
  import Pretty
  import Outputable
-import qualified Outputable
  import FastString
  import FastTypes       ( isFastTrue )
+import Constants       ( wORD_SIZE )
  
  #ifdef DEBUG
  import Outputable      ( assertPanic )
@@ -102,15 +103,17 @@ stmtToInstrs stmt = case stmt of
  
      CmmAssign reg src
        | isFloatingRep kind -> assignReg_FltCode kind reg src
-      | wordRep == I32 && kind == I64
-                          -> assignReg_I64Code      reg src
+#if WORD_SIZE_IN_BITS==32
+      | kind == I64       -> assignReg_I64Code      reg src
+#endif
        | otherwise         -> assignReg_IntCode kind reg src
         where kind = cmmRegRep reg
  
      CmmStore addr src
        | isFloatingRep kind -> assignMem_FltCode kind addr src
-      | wordRep == I32 && kind == I64
-                        -> assignMem_I64Code      addr src
+#if WORD_SIZE_IN_BITS==32
+      | kind == I64     -> assignMem_I64Code      addr src
+#endif
        | otherwise       -> assignMem_IntCode kind addr src
         where kind = cmmExprRep src
  
@@ -157,9 +160,14 @@ data ChildCode64   -- a.k.a "Register64"
                         -- selection game are therefore that the returned
                         -- Reg may be modified
  
+#if WORD_SIZE_IN_BITS==32
  assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
  assignReg_I64Code :: CmmReg  -> CmmExpr -> NatM InstrBlock
+#endif
+
+#ifndef x86_64_TARGET_ARCH
  iselExpr64        :: CmmExpr -> NatM ChildCode64
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -463,6 +471,21 @@ swizzleRegisterRep (Any _ codefn)     rep = Any rep codefn
  
  
  -- -----------------------------------------------------------------------------
+-- Utils based on getRegister, below
+
+-- The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code -> do
+       tmp <- getNewRegNat rep
+       return (tmp, code tmp)
+    Fixed _ reg code -> 
+       return (reg, code)
+
+-- -----------------------------------------------------------------------------
  -- Grab the Reg for a CmmReg
  
  getRegisterReg :: CmmReg -> Reg
@@ -489,17 +512,17 @@ getRegisterReg (CmmGlobal mid)
  
  getRegister :: CmmExpr -> NatM Register
  
+getRegister (CmmReg (CmmGlobal PicBaseReg))
+  = do
+      reg <- getPicBaseNat wordRep
+      return (Fixed wordRep reg nilOL)
+
  getRegister (CmmReg reg) 
    = return (Fixed (cmmRegRep reg) (getRegisterReg reg) nilOL)
  
  getRegister tree@(CmmRegOff _ _) 
    = getRegister (mangleIndexTree tree)
  
-getRegister CmmPicBaseReg
-  = do
-      reg <- getPicBaseNat wordRep
-      return (Fixed wordRep reg nilOL)
-
  -- end of machine-"independent" bit; here we go on the rest...
  
  #if alpha_TARGET_ARCH
@@ -769,6 +792,30 @@ getRegister (CmmLit (CmmFloat d F64))
      -- in
      return (Any F64 code)
  
+#endif /* i386_TARGET_ARCH */
+
+#if x86_64_TARGET_ARCH
+
+getRegister (CmmLit (CmmFloat 0.0 rep)) = do
+   let code dst = unitOL  (XOR rep (OpReg dst) (OpReg dst))
+       -- I don't know why there are xorpd, xorps, and pxor instructions.
+       -- They all appear to do the same thing --SDM
+   return (Any rep code)
+
+getRegister (CmmLit (CmmFloat f rep)) = do
+    lbl <- getNewLabelNat
+    let code dst = toOL [
+           LDATA ReadOnlyData
+                       [CmmDataLabel lbl,
+                        CmmStaticLit (CmmFloat f rep)],
+           MOV rep (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+           ]
+    -- in
+    return (Any rep code)
+
+#endif /* x86_64_TARGET_ARCH */
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- catch simple cases of zero- or sign-extended load
  getRegister (CmmMachOp (MO_U_Conv I8 I32) [CmmLoad addr _]) = do
@@ -787,11 +834,87 @@ getRegister (CmmMachOp (MO_S_Conv I16 I32) [CmmLoad addr _]) = do
    code <- intLoadCode (MOVSxL I16) addr
    return (Any I32 code)
  
+#endif
+
+#if x86_64_TARGET_ARCH
+
+-- catch simple cases of zero- or sign-extended load
+getRegister (CmmMachOp (MO_U_Conv I8 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVZxL I8) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I8 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I8) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_U_Conv I16 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVZxL I16) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I16 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I16) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_U_Conv I32 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOV I32) addr -- 32-bit loads zero-extend
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I32 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I32) addr
+  return (Any I64 code)
+
+#endif
+
+#if x86_64_TARGET_ARCH
+getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do
+  x_code <- getAnyReg x
+  lbl <- getNewLabelNat
+  let
+    code dst = x_code dst `appOL` toOL [
+       -- This is how gcc does it, so it can't be that bad:
+       LDATA ReadOnlyData16 [
+               CmmAlign 16,
+               CmmDataLabel lbl,
+               CmmStaticLit (CmmInt 0x80000000 I32),
+               CmmStaticLit (CmmInt 0 I32),
+               CmmStaticLit (CmmInt 0 I32),
+               CmmStaticLit (CmmInt 0 I32)
+       ],
+       XOR F32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+               -- xorps, so we need the 128-bit constant
+               -- ToDo: rip-relative
+       ]
+  --
+  return (Any F32 code)
+
+getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do
+  x_code <- getAnyReg x
+  lbl <- getNewLabelNat
+  let
+       -- This is how gcc does it, so it can't be that bad:
+    code dst = x_code dst `appOL` toOL [
+       LDATA ReadOnlyData16 [
+               CmmAlign 16,
+               CmmDataLabel lbl,
+               CmmStaticLit (CmmInt 0x8000000000000000 I64),
+               CmmStaticLit (CmmInt 0 I64)
+       ],
+               -- gcc puts an unpck here.  Wonder if we need it.
+       XOR F64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+               -- xorpd, so we need the 128-bit constant
+       ]
+  --
+  return (Any F64 code)
+#endif
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  getRegister (CmmMachOp mop [x]) -- unary MachOps
    = case mop of
+#if i386_TARGET_ARCH
        MO_S_Neg F32 -> trivialUFCode F32 (GNEG F32) x
        MO_S_Neg F64 -> trivialUFCode F64 (GNEG F64) x
+#endif
  
        MO_S_Neg rep -> trivialUCode rep (NEGI rep) x
        MO_Not rep   -> trivialUCode rep (NOT  rep) x
@@ -805,6 +928,15 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
        MO_S_Conv I16 I8  -> conversionNop I16 x
        MO_U_Conv I32 I16 -> conversionNop I32 x
        MO_S_Conv I32 I16 -> conversionNop I32 x
+#if x86_64_TARGET_ARCH
+      MO_U_Conv I64 I32 -> conversionNop I64 x
+      MO_S_Conv I64 I32 -> conversionNop I64 x
+      MO_U_Conv I64 I16 -> conversionNop I64 x
+      MO_S_Conv I64 I16 -> conversionNop I64 x
+      MO_U_Conv I64 I8  -> conversionNop I64 x
+      MO_S_Conv I64 I8  -> conversionNop I64 x
+#endif
+
        MO_U_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x
        MO_S_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x
  
@@ -817,12 +949,32 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
        MO_S_Conv I16 I32 -> integerExtend I16 I32 MOVSxL x
        MO_S_Conv I8  I16 -> integerExtend I8  I16 MOVSxL x
  
+#if x86_64_TARGET_ARCH
+      MO_U_Conv I8  I64 -> integerExtend I8  I64 MOVZxL x
+      MO_U_Conv I16 I64 -> integerExtend I16 I64 MOVZxL x
+      MO_U_Conv I32 I64 -> integerExtend I32 I64 MOVZxL x
+      MO_S_Conv I8  I64 -> integerExtend I8  I64 MOVSxL x
+      MO_S_Conv I16 I64 -> integerExtend I16 I64 MOVSxL x
+      MO_S_Conv I32 I64 -> integerExtend I32 I64 MOVSxL x
+       -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+       -- However, we don't want the register allocator to throw it
+       -- away as an unnecessary reg-to-reg move, so we keep it in
+       -- the form of a movzl and print it as a movl later.
+#endif
+
+#if i386_TARGET_ARCH
        MO_S_Conv F32 F64 -> conversionNop F64 x
        MO_S_Conv F64 F32 -> conversionNop F32 x
+#else
+      MO_S_Conv F32 F64 -> coerceFP2FP F64 x
+      MO_S_Conv F64 F32 -> coerceFP2FP F32 x
+#endif
+
        MO_S_Conv from to
         | isFloatingRep from -> coerceFP2Int from to x
         | isFloatingRep to   -> coerceInt2FP from to x
  
+      other -> pprPanic "getRegister" (pprMachOp mop)
     where
         -- signed or unsigned extension.
         integerExtend from to instr expr = do
@@ -869,14 +1021,27 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
        MO_U_Lt rep -> condIntReg LU  x y
        MO_U_Le rep -> condIntReg LEU x y
  
-      MO_Add F32 -> trivialFCode F32  GADD x y
-      MO_Sub F32 -> trivialFCode F32  GSUB x y
+#if i386_TARGET_ARCH
+      MO_Add F32 -> trivialFCode F32 GADD x y
+      MO_Sub F32 -> trivialFCode F32 GSUB x y
  
        MO_Add F64 -> trivialFCode F64 GADD x y
        MO_Sub F64 -> trivialFCode F64 GSUB x y
  
-      MO_S_Quot F32 -> trivialFCode  F32  GDIV x y
+      MO_S_Quot F32 -> trivialFCode F32 GDIV x y
        MO_S_Quot F64 -> trivialFCode F64 GDIV x y
+#endif
+
+#if x86_64_TARGET_ARCH
+      MO_Add F32 -> trivialFCode F32 ADD x y
+      MO_Sub F32 -> trivialFCode F32 SUB x y
+
+      MO_Add F64 -> trivialFCode F64 ADD x y
+      MO_Sub F64 -> trivialFCode F64 SUB x y
+
+      MO_S_Quot F32 -> trivialFCode F32 FDIV x y
+      MO_S_Quot F64 -> trivialFCode F64 FDIV x y
+#endif
  
        MO_Add rep -> add_code rep x y
        MO_Sub rep -> sub_code rep x y
@@ -886,8 +1051,16 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
        MO_U_Quot rep -> div_code rep False True  x y
        MO_U_Rem  rep -> div_code rep False False x y
  
+#if i386_TARGET_ARCH
        MO_Mul   F32 -> trivialFCode F32 GMUL x y
        MO_Mul   F64 -> trivialFCode F64 GMUL x y
+#endif
+
+#if x86_64_TARGET_ARCH
+      MO_Mul   F32 -> trivialFCode F32 MUL x y
+      MO_Mul   F64 -> trivialFCode F64 MUL x y
+#endif
+
        MO_Mul   rep -> let op = IMUL rep in 
                       trivialCode rep op (Just op) x y
  
@@ -912,24 +1085,26 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
    where
      --------------------
      imulMayOflo :: MachRep -> CmmExpr -> CmmExpr -> NatM Register
-    imulMayOflo I32 a b = do
-         res_lo <- getNewRegNat I32
-         res_hi <- getNewRegNat I32
+    imulMayOflo rep a b = do
           (a_reg, a_code) <- getNonClobberedReg a
-         (b_reg, b_code) <- getSomeReg   b
+         b_code <- getAnyReg b
           let 
-             code dst = a_code `appOL` b_code `appOL`
+            shift_amt  = case rep of
+                          I32 -> 31
+                          I64 -> 63
+                          _ -> panic "shift_amt"
+
+             code = a_code `appOL` b_code eax `appOL`
                          toOL [
-                           MOV I32 (OpReg a_reg) (OpReg res_hi),
-                           MOV I32 (OpReg b_reg) (OpReg res_lo),
-                           IMUL64 res_hi res_lo,               -- result in res_hi:res_lo
-                           SAR I32 (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part
-                           SUB I32 (OpReg res_hi) (OpReg res_lo),      -- compare against upper
-                           MOV I32 (OpReg res_lo) (OpReg dst)
-                           -- dst==0 if high part == sign extended low part
+                          IMUL2 rep (OpReg a_reg),   -- result in %edx:%eax
+                           SAR rep (OpImm (ImmInt shift_amt)) (OpReg eax),
+                               -- sign extend lower part
+                           SUB rep (OpReg edx) (OpReg eax)
+                               -- compare against upper
+                           -- eax==0 if high part == sign extended low part
                          ]
           -- in
-        return (Any I32 code)
+        return (Fixed rep eax code)
  
      --------------------
      shift_code :: MachRep
@@ -977,17 +1152,17 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
             code dst
                 = x_code `snocOL`
                  LEA rep
-                       (OpAddr (AddrBaseIndex (Just x_reg) Nothing imm))
+                       (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                          (OpReg dst)
         -- 
         return (Any rep code)
  
      ----------------------
      div_code rep signed quotient x y = do
-          (y_op, y_code) <- getOperand y -- cannot be clobbered
+          (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
            x_code <- getAnyReg x
            let
-            widen | signed    = CLTD
+            widen | signed    = CLTD rep
                    | otherwise = XOR rep (OpReg edx) (OpReg edx)
  
              instr | signed    = IDIV
@@ -1004,17 +1179,18 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
             return (Fixed rep result code)
  
  
-
  getRegister (CmmLoad mem pk)
    | isFloatingRep pk
    = do
      Amode src mem_code <- getAmode mem
      let
         code dst = mem_code `snocOL` 
-                  GLD pk src dst
+                  IF_ARCH_i386(GLD pk src dst,
+                               MOV pk (OpAddr src) (OpReg dst))
      --
      return (Any pk code)
  
+#if i386_TARGET_ARCH
  getRegister (CmmLoad mem pk)
    | pk /= I64
    = do 
@@ -1029,14 +1205,47 @@ getRegister (CmmLoad mem pk)
         -- we can't guarantee access to an 8-bit variant of every register
         -- (esi and edi don't have 8-bit variants), so to make things
         -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+#endif
+
+#if x86_64_TARGET_ARCH
+-- Simpler memory load code on x86_64
+getRegister (CmmLoad mem pk)
+  = do 
+    code <- intLoadCode (MOV pk) mem
+    return (Any pk code)
+#endif
  
  getRegister (CmmLit (CmmInt 0 rep))
    = let
+       -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
+       adj_rep = case rep of I64 -> I32; _ -> rep
+       rep1 = IF_ARCH_i386( rep, adj_rep ) 
         code dst 
-           = unitOL (XOR rep (OpReg dst) (OpReg dst))
+           = unitOL (XOR rep1 (OpReg dst) (OpReg dst))
      in
         return (Any rep code)
  
+#if x86_64_TARGET_ARCH
+  -- optimisation for loading small literals on x86_64: take advantage
+  -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+  -- instruction forms are shorter.
+getRegister (CmmLit lit) 
+  | I64 <- cmmLitRep lit, not (isBigLit lit)
+  = let 
+       imm = litToImm lit
+       code dst = unitOL (MOV I32 (OpImm imm) (OpReg dst))
+    in
+       return (Any I64 code)
+  where
+   isBigLit (CmmInt i I64) = i < 0 || i > 0xffffffff
+   isBigLit _ = False
+       -- note1: not the same as is64BitLit, because that checks for
+       -- signed literals that fit in 32 bits, but we want unsigned
+       -- literals here.
+       -- note2: all labels are small, because we're assuming the
+       -- small memory model (see gcc docs, -mcmodel=small).
+#endif
+
  getRegister (CmmLit lit)
    = let 
         rep = cmmLitRep lit
@@ -1045,7 +1254,7 @@ getRegister (CmmLit lit)
      in
         return (Any rep code)
  
-getRegister other = panic "getRegister(x86)"
+getRegister other = pprPanic "getRegister(x86)" (ppr other)
  
  
  intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
@@ -1065,22 +1274,13 @@ anyReg :: Register -> NatM (Reg -> InstrBlock)
  anyReg (Any _ code)          = return code
  anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
  
--- The dual to getAnyReg: compute an expression into a register, but
--- we don't mind which one it is.
-getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
-getSomeReg expr = do
-  r <- getRegister expr
-  case r of
-    Any rep code -> do
-       tmp <- getNewRegNat rep
-       return (tmp, code tmp)
-    Fixed _ reg code -> 
-       return (reg, code)
-
  -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
  -- Fixed registers might not be byte-addressable, so we make sure we've
  -- got a temporary, inserting an extra reg copy if necessary.
  getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
+#if x86_64_TARGET_ARCH
+getByteReg = getSomeReg -- all regs are byte-addressable on x86_64
+#else
  getByteReg expr = do
    r <- getRegister expr
    case r of
@@ -1094,6 +1294,7 @@ getByteReg expr = do
             return (tmp, code `snocOL` reg2reg rep reg tmp)
         -- ToDo: could optimise slightly by checking for byte-addressable
         -- real registers, but that will happen very rarely if at all.
+#endif
  
  -- Another variant: this time we want the result in a register that cannot
  -- be modified by code to evaluate an arbitrary expression.
@@ -1114,10 +1315,12 @@ getNonClobberedReg expr = do
  
  reg2reg :: MachRep -> Reg -> Reg -> Instr
  reg2reg rep src dst 
+#if i386_TARGET_ARCH
    | isFloatingRep rep = GMOV src dst
+#endif
    | otherwise        = MOV rep (OpReg src) (OpReg dst)
  
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -1553,19 +1756,6 @@ extendSExpr rep x = CmmMachOp (MO_S_Conv rep I32) [x]
  extendUExpr I32 x = x
  extendUExpr rep x = CmmMachOp (MO_U_Conv rep I32) [x]
  
--- ###FIXME: exact code duplication from x86 case
--- The dual to getAnyReg: compute an expression into a register, but
--- we don't mind which one it is.
-getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
-getSomeReg expr = do
-  r <- getRegister expr
-  case r of
-    Any rep code -> do
-       tmp <- getNewRegNat rep
-       return (tmp, code tmp)
-    Fixed _ reg code -> 
-       return (reg, code)
-
  #endif /* powerpc_TARGET_ARCH */
  
  
@@ -1639,21 +1829,23 @@ getAmode other
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- This is all just ridiculous, since it carefully undoes 
  -- what mangleIndexTree has just done.
-getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit (CmmInt i _)])
+getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)])
+  | not (is64BitLit lit)
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (-(fromInteger i))
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
    
-getAmode (CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt i _)])
+getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)])
+  | not (is64BitLit lit)
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (fromInteger i)
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
  
  -- Turn (lit1 << n  + lit2) into  (lit2 + lit1 << n) so it will be 
  -- recognised by the next rule.
@@ -1671,17 +1863,17 @@ getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _)
         let
            code = x_code `appOL` y_code
             base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
-       return (Amode (AddrBaseIndex (Just x_reg) (Just (y_reg,base)) (ImmInt 0))
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt 0))
                 code)
  
-getAmode (CmmLit lit)
+getAmode (CmmLit lit) | not (is64BitLit lit)
    = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
  
  getAmode expr = do
    (reg,code) <- getSomeReg expr
-  return (Amode (AddrBaseIndex (Just reg) Nothing (ImmInt 0)) code)
+  return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
  
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -1795,29 +1987,38 @@ getAmode other
  -- -----------------------------------------------------------------------------
  -- getOperand: sometimes any operand will do.
  
--- getOperand gets a *safe* operand; that is, the value of the operand
--- will remain valid across the computation of an arbitrary expression,
--- unless the expression is computed directly into a register which
--- the operand refers to (see trivialCode where this function is used
--- for an example).
+-- getNonClobberedOperand: the value of the operand will remain valid across
+-- the computation of an arbitrary expression, unless the expression
+-- is computed directly into a register which the operand refers to
+-- (see trivialCode where this function is used for an example).
  
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
-getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
-getOperand (CmmLoad mem pk) 
-  | not (isFloatingRep pk) && pk /= I64 = do
+getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
+getNonClobberedOperand (CmmLit lit)
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
+getNonClobberedOperand (CmmLit lit)
+  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) =
+    return (OpImm (litToImm lit), nilOL)
+getNonClobberedOperand (CmmLoad mem pk) 
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
      Amode src mem_code <- getAmode mem
      (src',save_code) <- 
         if (amodeCouldBeClobbered src) 
                 then do
                    tmp <- getNewRegNat wordRep
-                  return (AddrBaseIndex (Just tmp) Nothing (ImmInt 0),
+                  return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                            unitOL (LEA I32 (OpAddr src) (OpReg tmp)))
                 else
                    return (src, nilOL)
      return (OpAddr src', save_code `appOL` mem_code)
-
-getOperand e = do
+getNonClobberedOperand e = do
      (reg, code) <- getNonClobberedReg e
      return (OpReg reg, code)
  
@@ -1827,6 +2028,56 @@ amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode)
  regClobbered (RealReg rr) = isFastTrue (freeReg rr)
  regClobbered _ = False
  
+-- getOperand: the operand is not required to remain valid across the
+-- computation of an arbitrary expression.
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
+getOperand (CmmLit lit)
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
+getOperand (CmmLit lit)
+  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = do
+    return (OpImm (litToImm lit), nilOL)
+getOperand (CmmLoad mem pk)
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
+    Amode src mem_code <- getAmode mem
+    return (OpAddr src, mem_code)
+getOperand e = do
+    (reg, code) <- getSomeReg e
+    return (OpReg reg, code)
+
+isOperand :: CmmExpr -> Bool
+isOperand (CmmLoad _ _) = True
+isOperand (CmmLit lit)  = not (is64BitLit lit)
+                         || isSuitableFloatingPointLit lit
+isOperand _             = False
+
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory.  However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
+getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
+getRegOrMem (CmmLoad mem pk)
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
+    Amode src mem_code <- getAmode mem
+    return (OpAddr src, mem_code)
+getRegOrMem e = do
+    (reg, code) <- getNonClobberedReg e
+    return (OpReg reg, code)
+
+#if x86_64_TARGET_ARCH
+is64BitLit (CmmInt i I64) = i > 0x7fffffff || i < -0x80000000
+   -- assume that labels are in the range 0-2^31-1: this assumes the
+   -- small memory model (see gcc docs, -mcmodel=small).
+#endif
+is64BitLit x = False
  #endif
  
  -- -----------------------------------------------------------------------------
@@ -1846,7 +2097,7 @@ getCondCode = panic "MachCode.getCondCode: not on Alphas"
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH || sparc_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH || sparc_TARGET_ARCH
  -- yes, they really do seem to want exactly the same!
  
  getCondCode (CmmMachOp mop [x, y])
@@ -1940,10 +2191,10 @@ condFltCode = panic "MachCode.condFltCode: not on Alphas"
  #endif /* alpha_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- memory vs immediate
-condIntCode cond (CmmLoad x pk) (CmmLit lit) = do
+condIntCode cond (CmmLoad x pk) (CmmLit lit) | not (is64BitLit lit) = do
      Amode x_addr x_code <- getAmode x
      let
         imm  = litToImm lit
@@ -1961,50 +2212,29 @@ condIntCode cond x (CmmLit (CmmInt 0 pk)) = do
      --
      return (CondCode False cond code)
  
--- anything vs immediate
-condIntCode cond x (CmmLit lit) = do
-    (x_reg, x_code) <- getSomeReg x
+-- anything vs operand
+condIntCode cond x y | isOperand y = do
+    (x_reg, x_code) <- getNonClobberedReg x
+    (y_op,  y_code) <- getOperand y    
      let
-       imm  = litToImm lit
-       code = x_code `snocOL`
-                  CMP (cmmLitRep lit) (OpImm imm) (OpReg x_reg)
-    -- in
-    return (CondCode False cond code)
-
--- memory vs anything
-condIntCode cond (CmmLoad x pk) y = do
-    (y_reg, y_code) <- getNonClobberedReg y
-    Amode x_addr x_code <- getAmode x
-    let
-       code = y_code `appOL`
-               x_code `snocOL`
-                 CMP pk (OpReg y_reg) (OpAddr x_addr)
-    -- in
-    return (CondCode False cond code)
-
--- anything vs memory
-condIntCode cond y (CmmLoad x pk) = do
-    (y_reg, y_code) <- getNonClobberedReg y
-    Amode x_addr x_code <- getAmode x
-    let
-       code = y_code `appOL`
-               x_code `snocOL`
-                 CMP pk (OpAddr x_addr) (OpReg y_reg)
+       code = x_code `appOL` y_code `snocOL`
+                  CMP (cmmExprRep x) y_op (OpReg x_reg)
      -- in
      return (CondCode False cond code)
  
  -- anything vs anything
  condIntCode cond x y = do
-  (x_op, x_code) <- getOperand x
-  (y_reg, y_code) <- getSomeReg y
+  (y_reg, y_code) <- getNonClobberedReg y
+  (x_op, x_code) <- getRegOrMem x
    let
-       code = x_code `appOL`
-              y_code `snocOL`
+       code = y_code `appOL`
+              x_code `snocOL`
                   CMP (cmmExprRep x) (OpReg y_reg) x_op
    -- in
    return (CondCode False cond code)
+#endif
  
------------
+#if i386_TARGET_ARCH
  condFltCode cond x y 
    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
    (x_reg, x_code) <- getNonClobberedReg x
@@ -2015,9 +2245,25 @@ condFltCode cond x y
    -- The GCMP insn does the test and sets the zero flag if comparable
    -- and true.  Hence we always supply EQQ as the condition to test.
    return (CondCode True EQQ code)
-
  #endif /* i386_TARGET_ARCH */
  
+#if x86_64_TARGET_ARCH
+-- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+-- an operand, but the right must be a reg.  We can probably do better
+-- than this general case...
+condFltCode cond x y = do
+  (x_reg, x_code) <- getNonClobberedReg x
+  (y_op, y_code) <- getOperand y
+  let
+       code = x_code `appOL`
+              y_code `snocOL`
+                 CMP (cmmExprRep x) y_op (OpReg x_reg)
+       -- NB(1): we need to use the unsigned comparison operators on the
+       -- result of this comparison.
+  -- in
+  return (CondCode True (condToUnsigned cond) code)
+#endif
+
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if sparc_TARGET_ARCH
@@ -2085,7 +2331,7 @@ condFltCode cond x y
  #endif /* sparc_TARGET_ARCH */
  
  #if powerpc_TARGET_ARCH
--- ###FIXME: I16 and I8!
+--  ###FIXME: I16 and I8!
  condIntCode cond x (CmmLit (CmmInt y rep))
    | Just src2 <- makeImmediate rep (not $ condUnsigned cond) y
    = do
@@ -2171,7 +2417,7 @@ assignIntCode pk dst src
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- integer assignment to memory
  assignMem_IntCode pk addr src = do
@@ -2189,7 +2435,7 @@ assignMem_IntCode pk addr src = do
      return code
    where
      get_op_RI :: CmmExpr -> NatM (InstrBlock,Operand)  -- code, operator
-    get_op_RI (CmmLit lit)
+    get_op_RI (CmmLit lit) | not (is64BitLit lit)
        = return (nilOL, OpImm (litToImm lit))
      get_op_RI op
        = do (reg,code) <- getNonClobberedReg op
@@ -2298,7 +2544,7 @@ assignFltCode pk dst src
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- Floating point assignment to memory
  assignMem_FltCode pk addr src = do
@@ -2307,7 +2553,8 @@ assignMem_FltCode pk addr src = do
    let
         code = src_code `appOL`
                addr_code `snocOL`
-                GST pk src_reg addr
+                IF_ARCH_i386(GST pk src_reg addr,
+                            MOV pk (OpReg src_reg) (OpAddr addr))
    return code
  
  -- Floating point assignment to a register/temporary
@@ -2416,7 +2663,7 @@ genJump tree
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  genJump (CmmLoad mem pk) = do
    Amode target code <- getAmode mem
@@ -2471,7 +2718,7 @@ genBranch :: BlockId -> NatM InstrBlock
  genBranch id = return (unitOL (BR id))
  #endif
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  genBranch id = return (unitOL (JXX ALWAYS id))
  #endif
  
@@ -2667,8 +2914,45 @@ genCondJump id bool = do
    CondCode _ cond code <- getCondCode bool
    return (code `snocOL` JXX cond id)
  
-#endif /* i386_TARGET_ARCH */
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if x86_64_TARGET_ARCH
+
+genCondJump id bool = do
+  CondCode is_float cond cond_code <- getCondCode bool
+  if not is_float
+    then
+       return (cond_code `snocOL` JXX cond id)
+    else do
+       lbl <- getBlockIdNat
+
+       -- see comment with condFltReg
+       let code = case cond of
+                       NE  -> or_unordered
+                       GU  -> plain_test
+                       GEU -> plain_test
+                       _   -> and_ordered
+
+           plain_test = unitOL (
+                 JXX cond id
+               )
+           or_unordered = toOL [
+                 JXX cond id,
+                 JXX PARITY id
+               ]
+           and_ordered = toOL [
+                 JXX PARITY lbl,
+                 JXX cond id,
+                 JXX ALWAYS lbl,
+                 NEWBLOCK lbl
+               ]
+       return (cond_code `appOL` code)
+
+#endif
  
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if sparc_TARGET_ARCH
  
@@ -2823,19 +3107,19 @@ genCCall target dest_regs args vols = do
         -- CmmPrim -> ...
          CmmForeignCall (CmmLit (CmmLabel lbl)) conv
             -> -- ToDo: stdcall arg sizes
-             return (unitOL (CALL (Left fn_imm)), conv)
+             return (unitOL (CALL (Left fn_imm) []), conv)
            where fn_imm = ImmCLbl lbl
          CmmForeignCall expr conv
             -> do (dyn_c, dyn_r, dyn_rep) <- get_op expr
                   ASSERT(dyn_rep == I32)
-                  return (dyn_c `snocOL` CALL (Right dyn_r), conv)
+                  return (dyn_c `snocOL` CALL (Right dyn_r) [], conv)
  
      let        push_code = concatOL push_codes
         call = callinsns `appOL`
                 toOL (
                         -- Deallocate parameters after call for ccall;
                         -- but not for stdcall (callee does it)
-                  (if cconv == StdCallConv then [] else 
+                  (if cconv == StdCallConv || tot_arg_size==0 then [] else 
                    [ADD I32 (OpImm (ImmInt tot_arg_size)) (OpReg esp)])
                    ++
                    [DELTA (delta + tot_arg_size)]
@@ -2895,8 +3179,8 @@ genCCall target dest_regs args vols = do
                          code `appOL`
                          toOL [SUB I32 (OpImm (ImmInt size)) (OpReg esp),
                                DELTA (delta-size),
-                              GST sz reg (AddrBaseIndex (Just esp) 
-                                                        Nothing 
+                              GST sz reg (AddrBaseIndex (EABaseReg esp) 
+                                                        EAIndexNone
                                                          (ImmInt 0))]
                         )
             else return (size,
@@ -2913,6 +3197,9 @@ genCCall target dest_regs args vols = do
          (reg,code) <- getSomeReg op
         return (code, reg, cmmExprRep op)
  
+#endif /* i386_TARGET_ARCH */
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  outOfLineFloatOp :: CallishMachOp -> CmmReg -> [(CmmExpr,MachHint)]
    -> Maybe [GlobalReg] -> NatM InstrBlock
@@ -2936,6 +3223,10 @@ outOfLineFloatOp mop res args vols
         lbl = CmmLabel (mkForeignLabel fn Nothing False)
  
         fn = case mop of
+             MO_F32_Sqrt  -> FSLIT("sqrt")
+             MO_F32_Sin   -> FSLIT("sin")
+             MO_F32_Cos   -> FSLIT("cos")
+             MO_F32_Tan   -> FSLIT("tan")
               MO_F32_Exp   -> FSLIT("exp")
               MO_F32_Log   -> FSLIT("log")
  
@@ -2948,6 +3239,10 @@ outOfLineFloatOp mop res args vols
               MO_F32_Tanh  -> FSLIT("tanh")
               MO_F32_Pwr   -> FSLIT("pow")
  
+             MO_F64_Sqrt  -> FSLIT("sqrt")
+             MO_F64_Sin   -> FSLIT("sin")
+             MO_F64_Cos   -> FSLIT("cos")
+             MO_F64_Tan   -> FSLIT("tan")
               MO_F64_Exp   -> FSLIT("exp")
               MO_F64_Log   -> FSLIT("log")
  
@@ -2960,9 +3255,167 @@ outOfLineFloatOp mop res args vols
               MO_F64_Tanh  -> FSLIT("tanh")
               MO_F64_Pwr   -> FSLIT("pow")
  
-              other -> pprPanic "outOfLineFloatOp" (pprCallishMachOp mop)
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
-#endif /* i386_TARGET_ARCH */
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if x86_64_TARGET_ARCH
+
+genCCall (CmmPrim op) [(r,_)] args vols = 
+  outOfLineFloatOp op r args vols
+
+genCCall target dest_regs args vols = do
+
+       -- load up the register arguments
+    (stack_args, aregs, fregs, load_args_code)
+        <- load_args args allArgRegs allFPArgRegs nilOL
+
+    let
+       fp_regs_used  = reverse (drop (length fregs) (reverse allFPArgRegs))
+       int_regs_used = reverse (drop (length aregs) (reverse allArgRegs))
+       arg_regs = int_regs_used ++ fp_regs_used
+               -- for annotating the call instruction with
+
+       sse_regs = length fp_regs_used
+
+       tot_arg_size = arg_size * length stack_args
+
+       -- On entry to the called function, %rsp should be aligned
+       -- on a 16-byte boundary +8 (i.e. the first stack arg after
+       -- the return address is 16-byte aligned).  In STG land
+       -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
+       -- need to make sure we push a multiple of 16-bytes of args,
+       -- plus the return address, to get the correct alignment.
+       -- Urg, this is hard.  We need to feed the delta back into
+       -- the arg pushing code.
+    (real_size, adjust_rsp) <-
+       if tot_arg_size `rem` 16 == 0
+           then return (tot_arg_size, nilOL)
+           else do -- we need to adjust...
+               delta <- getDeltaNat
+               setDeltaNat (delta-8)
+               return (tot_arg_size+8, toOL [
+                               SUB I64 (OpImm (ImmInt 8)) (OpReg rsp),
+                               DELTA (delta-8)
+                       ])
+
+       -- push the stack args, right to left
+    push_code <- push_args (reverse stack_args) nilOL
+    delta <- getDeltaNat
+
+    -- deal with static vs dynamic call targets
+    (callinsns,cconv) <-
+      case target of
+       -- CmmPrim -> ...
+        CmmForeignCall (CmmLit (CmmLabel lbl)) conv
+           -> -- ToDo: stdcall arg sizes
+             return (unitOL (CALL (Left fn_imm) arg_regs), conv)
+          where fn_imm = ImmCLbl lbl
+        CmmForeignCall expr conv
+           -> do (dyn_r, dyn_c) <- getSomeReg expr
+                return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
+
+    let
+       -- The x86_64 ABI requires us to set %al to the number of SSE
+       -- registers that contain arguments, if the called routine
+       -- is a varargs function.  We don't know whether it's a
+       -- varargs function or not, so we have to assume it is.
+       --
+       -- It's not safe to omit this assignment, even if the number
+       -- of SSE regs in use is zero.  If %al is larger than 8
+       -- on entry to a varargs function, seg faults ensue.
+       assign_eax n = unitOL (MOV I32 (OpImm (ImmInt n)) (OpReg eax))
+
+    let call = callinsns `appOL`
+               toOL (
+                       -- Deallocate parameters after call for ccall;
+                       -- but not for stdcall (callee does it)
+                  (if cconv == StdCallConv || real_size==0 then [] else 
+                  [ADD wordRep (OpImm (ImmInt real_size)) (OpReg esp)])
+                  ++
+                  [DELTA (delta + real_size)]
+               )
+    -- in
+    setDeltaNat (delta + real_size)
+
+    let
+       -- assign the results, if necessary
+       assign_code []     = nilOL
+       assign_code [(dest,_hint)] = 
+         case rep of
+               F32 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest))
+               F64 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest))
+               rep -> unitOL (MOV rep (OpReg rax) (OpReg r_dest))
+         where 
+               rep = cmmRegRep dest
+               r_dest = getRegisterReg dest
+       assign_code many = panic "genCCall.assign_code many"
+
+    return (load_args_code     `appOL` 
+           adjust_rsp          `appOL`
+           push_code           `appOL`
+           assign_eax sse_regs `appOL`
+           call                `appOL` 
+           assign_code dest_regs)
+
+  where
+    arg_size = 8 -- always, at the mo
+
+    load_args :: [(CmmExpr,MachHint)]
+             -> [Reg]                  -- int regs avail for args
+             -> [Reg]                  -- FP regs avail for args
+             -> InstrBlock
+             -> NatM ([(CmmExpr,MachHint)],[Reg],[Reg],InstrBlock)
+    load_args args [] [] code     =  return (args, [], [], code)
+       -- no more regs to use
+    load_args [] aregs fregs code =  return ([], aregs, fregs, code)
+       -- no more args to push
+    load_args ((arg,hint) : rest) aregs fregs code
+       | isFloatingRep arg_rep = 
+       case fregs of
+         [] -> push_this_arg
+         (r:rs) -> do
+            arg_code <- getAnyReg arg
+            load_args rest aregs rs (code `appOL` arg_code r)
+       | otherwise =
+       case aregs of
+         [] -> push_this_arg
+         (r:rs) -> do
+            arg_code <- getAnyReg arg
+            load_args rest rs fregs (code `appOL` arg_code r)
+       where
+         arg_rep = cmmExprRep arg
+
+         push_this_arg = do
+           (args',ars,frs,code') <- load_args rest aregs fregs code
+           return ((arg,hint):args', ars, frs, code')
+
+    push_args [] code = return code
+    push_args ((arg,hint):rest) code
+       | isFloatingRep arg_rep = do
+        (arg_reg, arg_code) <- getSomeReg arg
+         delta <- getDeltaNat
+         setDeltaNat (delta-arg_size)
+        let code' = code `appOL` toOL [
+                       MOV arg_rep (OpReg arg_reg) (OpAddr  (spRel 0)),
+                       SUB wordRep (OpImm (ImmInt arg_size)) (OpReg rsp) ,
+                       DELTA (delta-arg_size)]
+        push_args rest code'
+
+       | otherwise = do
+       -- we only ever generate word-sized function arguments.  Promotion
+       -- has already happened: our Int8# type is kept sign-extended
+       -- in an Int#, for example.
+        ASSERT(arg_rep == I64) return ()
+        (arg_op, arg_code) <- getOperand arg
+         delta <- getDeltaNat
+         setDeltaNat (delta-arg_size)
+        let code' = code `appOL` toOL [PUSH I64 arg_op, 
+                                       DELTA (delta-arg_size)]
+        push_args rest code'
+       where
+         arg_rep = cmmExprRep arg
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -3132,6 +3585,10 @@ genCCall fn cconv kind args
      * SysV insists on either passing I64 arguments on the stack, or in two GPRs,
        starting with an odd-numbered GPR. It may skip a GPR to achieve this.
        Darwin just treats an I64 like two separate I32s (high word first).
+    * I64 and F64 arguments are 8-byte aligned on the stack for SysV, but only
+      4-byte aligned like everything else on Darwin.
+    * The SysV spec claims that F32 is represented as F64 on the stack. GCC on
+      PowerPC Linux does not agree, so neither do we.
        
      According to both conventions, The parameter area should be part of the
      caller's stack frame, allocated in the caller's prologue code (large enough
@@ -3175,17 +3632,17 @@ genCCall target dest_regs argsAndHints vols
  #if darwin_TARGET_OS
          initialStackOffset = 24
             -- size of linkage area + size of arguments, in bytes       
-       stackDelta _finalStack = roundTo16 $ (24 +) $ max 32 $ sum $
+       stackDelta _finalStack = roundTo 16 $ (24 +) $ max 32 $ sum $
                                        map machRepByteWidth argReps
  #elif linux_TARGET_OS
          initialStackOffset = 8
-        stackDelta finalStack = roundTo16 finalStack
+        stackDelta finalStack = roundTo 16 finalStack
  #endif
         args = map fst argsAndHints
         argReps = map cmmExprRep args
  
-       roundTo16 x | x `mod` 16 == 0 = x
-                   | otherwise = x + 16 - (x `mod` 16)
+       roundTo a x | x `mod` a == 0 = x
+                   | otherwise = x + a - (x `mod` a)
  
          move_sp_down finalStack
                 | delta > 64 =
@@ -3222,9 +3679,10 @@ genCCall target dest_regs argsAndHints vols
                  storeWord vr [] offset = ST I32 vr (AddrRegImm sp (ImmInt offset))
                  
  #elif linux_TARGET_OS
-                let stackCode = accumCode `appOL` code
-                        `snocOL` ST I32 vr_hi (AddrRegImm sp (ImmInt stackOffset))
-                        `snocOL` ST I32 vr_lo (AddrRegImm sp (ImmInt (stackOffset+4)))
+                let stackOffset' = roundTo 8 stackOffset
+                    stackCode = accumCode `appOL` code
+                        `snocOL` ST I32 vr_hi (AddrRegImm sp (ImmInt stackOffset'))
+                        `snocOL` ST I32 vr_lo (AddrRegImm sp (ImmInt (stackOffset'+4)))
                      regCode hireg loreg =
                          accumCode `appOL` code
                              `snocOL` MR hireg vr_hi
@@ -3238,7 +3696,7 @@ genCCall target dest_regs argsAndHints vols
                          passArguments args regs fprs stackOffset
                                        (regCode hireg loreg) (hireg : loreg : accumUsed)
                      _ -> -- only one or no regs left
-                        passArguments args [] fprs (stackOffset+8)
+                        passArguments args [] fprs (stackOffset'+8)
                                        stackCode accumUsed
  #endif
          
@@ -3265,11 +3723,20 @@ genCCall target dest_regs argsAndHints vols
                  passArguments args
                                (drop nGprs gprs)
                                (drop nFprs fprs)
-                              (stackOffset + stackBytes)
+                              (stackOffset' + stackBytes)
                                (accumCode `appOL` code `snocOL` ST rep vr stackSlot)
                                accumUsed
              where
-                stackSlot = AddrRegImm sp (ImmInt stackOffset)
+#if darwin_TARGET_OS
+        -- stackOffset is at least 4-byte aligned
+        -- The Darwin ABI is happy with that.
+                stackOffset' = stackOffset
+#else
+        -- ... the SysV ABI requires 8-byte alignment for doubles.
+                stackOffset' | rep == F64 = roundTo 8 stackOffset
+                             | otherwise  =           stackOffset
+#endif
+                stackSlot = AddrRegImm sp (ImmInt stackOffset')
                  (nGprs, nFprs, stackBytes, regs) = case rep of
                      I32 -> (1, 0, 4, gprs)
  #if darwin_TARGET_OS
@@ -3351,13 +3818,13 @@ genCCall target dest_regs argsAndHints vols
  
  genSwitch :: CmmExpr -> [Maybe BlockId] -> NatM InstrBlock
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  genSwitch expr ids = do
    (reg,e_code) <- getSomeReg expr
    lbl <- getNewLabelNat
    let
         jumpTable = map jumpTableEntry ids
-       op = OpAddr (AddrBaseIndex Nothing (Just (reg,4)) (ImmCLbl lbl))
+       op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl))
         code = e_code `appOL` toOL [
                 LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
                 JMP_TBL op [ id | Just id <- ids ]
@@ -3374,12 +3841,19 @@ genSwitch expr ids
          dynRef <- cmmMakeDynamicReference addImportNat False lbl
          (tableReg,t_code) <- getSomeReg $ dynRef
          let
-            jumpTable = map jumpTableEntry ids
-        
+            jumpTable = map jumpTableEntryRel ids
+            
+            jumpTableEntryRel Nothing
+                = CmmStaticLit (CmmInt 0 wordRep)
+            jumpTableEntryRel (Just (BlockId id))
+                = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
+                where blockLabel = mkAsmTempLabel id
+
              code = e_code `appOL` t_code `appOL` toOL [
                              LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
                              SLW tmp reg (RIImm (ImmInt 2)),
                              LD I32 tmp (AddrRegReg tableReg tmp),
+                            ADD tmp tmp (RIReg tableReg),
                              MTCTR tmp,
                              BCTR [ id | Just id <- ids ]
                      ]
@@ -3434,7 +3908,7 @@ condFltReg = panic "MachCode.condFltReg (not on Alpha)"
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  condIntReg cond x y = do
    CondCode _ cond cond_code <- condIntCode cond x y
@@ -3442,38 +3916,77 @@ condIntReg cond x y = do
    let 
         code dst = cond_code `appOL` toOL [
                     SETCC cond (OpReg tmp),
-                   MOV I32 (OpReg tmp) (OpReg dst),
-                   AND I32 (OpImm (ImmInt 1)) (OpReg dst)
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
                   ]
-                  -- NB. (1) Tha AND is needed here because the x86 only
-                  -- sets the low byte in the SETCC instruction.
-                  -- NB. (2) The extra temporary register is a hack to
-                  -- work around the fact that the setcc instructions only
-                  -- accept byte registers.  dst might not be a byte-able reg,
-                  -- but currently all free registers are byte-able, so we're
-                  -- guaranteed that a new temporary is byte-able.
    -- in
    return (Any I32 code)
  
+#endif
+
+#if i386_TARGET_ARCH
  
  condFltReg cond x y = do
-  lbl1 <- getBlockIdNat
-  lbl2 <- getBlockIdNat
    CondCode _ cond cond_code <- condFltCode cond x y
-  let
-       code dst = cond_code `appOL` toOL [
-           JXX cond lbl1,
-           MOV I32 (OpImm (ImmInt 0)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl1,
-           MOV I32 (OpImm (ImmInt 1)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl2]
-               -- SIGH, have to split up this block somehow...
+  tmp <- getNewRegNat I8
+  let 
+       code dst = cond_code `appOL` toOL [
+                   SETCC cond (OpReg tmp),
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
+                 ]
    -- in
    return (Any I32 code)
  
-#endif /* i386_TARGET_ARCH */
+#endif
+
+#if x86_64_TARGET_ARCH
+
+condFltReg cond x y = do
+  CondCode _ cond cond_code <- condFltCode cond x y
+  tmp1 <- getNewRegNat wordRep
+  tmp2 <- getNewRegNat wordRep
+  let 
+       -- We have to worry about unordered operands (eg. comparisons
+       -- against NaN).  If the operands are unordered, the comparison
+       -- sets the parity flag, carry flag and zero flag.
+       -- All comparisons are supposed to return false for unordered
+       -- operands except for !=, which returns true.
+       --
+       -- Optimisation: we don't have to test the parity flag if we
+       -- know the test has already excluded the unordered case: eg >
+       -- and >= test for a zero carry flag, which can only occur for
+       -- ordered operands.
+       --
+       -- ToDo: by reversing comparisons we could avoid testing the
+       -- parity flag in more cases.
+
+       code dst = 
+          cond_code `appOL` 
+            (case cond of
+               NE  -> or_unordered dst
+               GU  -> plain_test   dst
+               GEU -> plain_test   dst
+               _   -> and_ordered  dst)
+
+       plain_test dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   MOVZxL I8 (OpReg tmp1) (OpReg dst)
+                ]
+       or_unordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC PARITY (OpReg tmp2),
+                   OR I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+       and_ordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC NOTPARITY (OpReg tmp2),
+                   AND I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+  -- in
+  return (Any I32 code)
+
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -3637,9 +4150,11 @@ trivialCode
      -> IF_ARCH_alpha((Reg -> RI -> Reg -> Instr)
        ,IF_ARCH_i386 ((Operand -> Operand -> Instr) 
                       -> Maybe (Operand -> Operand -> Instr)
+      ,IF_ARCH_x86_64 ((Operand -> Operand -> Instr) 
+                     -> Maybe (Operand -> Operand -> Instr)
        ,IF_ARCH_sparc((Reg -> RI -> Reg -> Instr)
        ,IF_ARCH_powerpc(Bool -> (Reg -> Reg -> RI -> Instr)
-      ,))))
+      ,)))))
      -> CmmExpr -> CmmExpr -- the two arguments
      -> NatM Register
  
@@ -3649,7 +4164,8 @@ trivialFCode
      -> IF_ARCH_alpha((Reg -> Reg -> Reg -> Instr)
        ,IF_ARCH_sparc((Size -> Reg -> Reg -> Reg -> Instr)
        ,IF_ARCH_i386 ((MachRep -> Reg -> Reg -> Reg -> Instr)
-      ,)))
+      ,IF_ARCH_x86_64 ((MachRep -> Operand -> Operand -> Instr)
+      ,))))
      -> CmmExpr -> CmmExpr -- the two arguments
      -> NatM Register
  #endif
@@ -3658,9 +4174,10 @@ trivialUCode
      :: MachRep 
      -> IF_ARCH_alpha((RI -> Reg -> Instr)
        ,IF_ARCH_i386 ((Operand -> Instr)
+      ,IF_ARCH_x86_64 ((Operand -> Instr)
        ,IF_ARCH_sparc((RI -> Reg -> Instr)
        ,IF_ARCH_powerpc((Reg -> Reg -> Instr)
-      ,))))
+      ,)))))
      -> CmmExpr -- the one argument
      -> NatM Register
  
@@ -3669,8 +4186,9 @@ trivialUFCode
      :: MachRep
      -> IF_ARCH_alpha((Reg -> Reg -> Instr)
        ,IF_ARCH_i386 ((Reg -> Reg -> Instr)
+      ,IF_ARCH_x86_64 ((Reg -> Reg -> Instr)
        ,IF_ARCH_sparc((Reg -> Reg -> Instr)
-      ,)))
+      ,))))
      -> CmmExpr -- the one argument
      -> NatM Register
  #endif
@@ -3749,7 +4267,7 @@ trivialUFCode _ instr x
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  {-
  The Rules of the Game are:
@@ -3796,16 +4314,8 @@ SDM's version of The Rules:
    register happens to be the destination register.
  -}
  
-trivialCode rep instr maybe_revinstr a (CmmLit lit_b) = do
-  a_code <- getAnyReg a
-  let
-       code dst
-          = a_code dst `snocOL` 
-           instr (OpImm (litToImm lit_b)) (OpReg dst)
-  -- in
-  return (Any rep code)
-              
-trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
+trivialCode rep instr (Just revinstr) (CmmLit lit_a) b
+  | not (is64BitLit lit_a) = do
    b_code <- getAnyReg b
    let
         code dst 
@@ -3814,8 +4324,11 @@ trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
    -- in
    return (Any rep code)
  
-trivialCode rep instr maybe_revinstr a b = do
-  (b_op, b_code) <- getOperand b
+trivialCode rep instr maybe_revinstr a b = genTrivialCode rep instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode rep instr a b = do
+  (b_op, b_code) <- getNonClobberedOperand b
    a_code <- getAnyReg a
    tmp <- getNewRegNat rep
    let
@@ -3826,7 +4339,7 @@ trivialCode rep instr maybe_revinstr a b = do
       -- as the destination reg.  In this case, we have to save b in a
       -- new temporary across the computation of a.
       code dst
-       | dst `clashesWith` b_op =
+       | dst `regClashesWithOp` b_op =
                 b_code `appOL`
                 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
                 a_code dst `snocOL`
@@ -3837,9 +4350,10 @@ trivialCode rep instr maybe_revinstr a b = do
                 instr b_op (OpReg dst)
    -- in
    return (Any rep code)
- where
-  reg `clashesWith` OpReg reg2   = reg == reg2
-  reg `clashesWith` OpAddr amode = any (==reg) (addrModeRegs amode)
+
+reg `regClashesWithOp` OpReg reg2   = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+reg `regClashesWithOp` _            = False
  
  -----------
  
@@ -3854,6 +4368,8 @@ trivialUCode rep instr x = do
  
  -----------
  
+#if i386_TARGET_ARCH
+
  trivialFCode pk instr x y = do
    (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
    (y_reg, y_code) <- getSomeReg y
@@ -3865,6 +4381,14 @@ trivialFCode pk instr x y = do
    -- in
    return (Any pk code)
  
+#endif
+
+#if x86_64_TARGET_ARCH
+
+trivialFCode pk instr x y = genTrivialCode  pk (instr pk) x y
+
+#endif
+
  -------------
  
  trivialUFCode rep instr x = do
@@ -4055,7 +4579,7 @@ remainderCode rep div x y = do
  coerceInt2FP :: MachRep -> MachRep -> CmmExpr -> NatM Register
  coerceFP2Int :: MachRep -> MachRep -> CmmExpr -> NatM Register
  
-#ifdef sparc_TARGET_ARCH
+#if sparc_TARGET_ARCH
  coerceDbl2Flt :: CmmExpr -> NatM Register
  coerceFlt2Dbl :: CmmExpr -> NatM Register
  #endif
@@ -4123,6 +4647,37 @@ coerceFP2Int from to x = do
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
+#if x86_64_TARGET_ARCH
+
+coerceFP2Int from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceInt2FP from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case to of F32 -> CVTSI2SS; F64 -> CVTSI2SD
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceFP2FP :: MachRep -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+  (x_reg, x_code) <- getSomeReg x
+  let
+        opc  = case to of F32 -> CVTSD2SS; F64 -> CVTSS2SD
+        code dst = x_code `snocOL` opc x_reg dst
+  -- in
+  return (Any to code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
  #if sparc_TARGET_ARCH
  
  coerceInt2FP pk x