[project @ 2005-05-21 15:39:00 by panne]

[ghc-hetmet.git] / ghc / compiler / nativeGen / MachCodeGen.hs
diff --git a/ghc/compiler/nativeGen/MachCodeGen.hs b/ghc/compiler/nativeGen/MachCodeGen.hs

index 6f0b2f4..24e8b04 100644 (file)
--- a/ghc/compiler/nativeGen/MachCodeGen.hs
+++ b/ghc/compiler/nativeGen/MachCodeGen.hs
@@ -15,6 +15,7 @@ module MachCodeGen ( cmmTopCodeGen, InstrBlock ) where
  
  #include "HsVersions.h"
  #include "nativeGen/NCG.h"
+#include "MachDeps.h"
  
  -- NCG stuff:
  import MachInstrs
@@ -29,14 +30,14 @@ import MachOp
  import CLabel
  
  -- The rest:
-import CmdLineOpts     ( opt_PIC )
+import StaticFlags     ( opt_PIC )
  import ForeignCall     ( CCallConv(..) )
  import OrdList
  import Pretty
  import Outputable
-import qualified Outputable
  import FastString
  import FastTypes       ( isFastTrue )
+import Constants       ( wORD_SIZE )
  
  #ifdef DEBUG
  import Outputable      ( assertPanic )
@@ -102,15 +103,17 @@ stmtToInstrs stmt = case stmt of
  
      CmmAssign reg src
        | isFloatingRep kind -> assignReg_FltCode kind reg src
-      | wordRep == I32 && kind == I64
-                          -> assignReg_I64Code      reg src
+#if WORD_SIZE_IN_BITS==32
+      | kind == I64       -> assignReg_I64Code      reg src
+#endif
        | otherwise         -> assignReg_IntCode kind reg src
         where kind = cmmRegRep reg
  
      CmmStore addr src
        | isFloatingRep kind -> assignMem_FltCode kind addr src
-      | wordRep == I32 && kind == I64
-                        -> assignMem_I64Code      addr src
+#if WORD_SIZE_IN_BITS==32
+      | kind == I64     -> assignMem_I64Code      addr src
+#endif
        | otherwise       -> assignMem_IntCode kind addr src
         where kind = cmmExprRep src
  
@@ -157,9 +160,14 @@ data ChildCode64   -- a.k.a "Register64"
                         -- selection game are therefore that the returned
                         -- Reg may be modified
  
+#if WORD_SIZE_IN_BITS==32
  assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
  assignReg_I64Code :: CmmReg  -> CmmExpr -> NatM InstrBlock
+#endif
+
+#ifndef x86_64_TARGET_ARCH
  iselExpr64        :: CmmExpr -> NatM ChildCode64
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -463,6 +471,21 @@ swizzleRegisterRep (Any _ codefn)     rep = Any rep codefn
  
  
  -- -----------------------------------------------------------------------------
+-- Utils based on getRegister, below
+
+-- The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code -> do
+       tmp <- getNewRegNat rep
+       return (tmp, code tmp)
+    Fixed _ reg code -> 
+       return (reg, code)
+
+-- -----------------------------------------------------------------------------
  -- Grab the Reg for a CmmReg
  
  getRegisterReg :: CmmReg -> Reg
@@ -489,17 +512,17 @@ getRegisterReg (CmmGlobal mid)
  
  getRegister :: CmmExpr -> NatM Register
  
+getRegister (CmmReg (CmmGlobal PicBaseReg))
+  = do
+      reg <- getPicBaseNat wordRep
+      return (Fixed wordRep reg nilOL)
+
  getRegister (CmmReg reg) 
    = return (Fixed (cmmRegRep reg) (getRegisterReg reg) nilOL)
  
  getRegister tree@(CmmRegOff _ _) 
    = getRegister (mangleIndexTree tree)
  
-getRegister CmmPicBaseReg
-  = do
-      reg <- getPicBaseNat wordRep
-      return (Fixed wordRep reg nilOL)
-
  -- end of machine-"independent" bit; here we go on the rest...
  
  #if alpha_TARGET_ARCH
@@ -769,6 +792,30 @@ getRegister (CmmLit (CmmFloat d F64))
      -- in
      return (Any F64 code)
  
+#endif /* i386_TARGET_ARCH */
+
+#if x86_64_TARGET_ARCH
+
+getRegister (CmmLit (CmmFloat 0.0 rep)) = do
+   let code dst = unitOL  (XOR rep (OpReg dst) (OpReg dst))
+       -- I don't know why there are xorpd, xorps, and pxor instructions.
+       -- They all appear to do the same thing --SDM
+   return (Any rep code)
+
+getRegister (CmmLit (CmmFloat f rep)) = do
+    lbl <- getNewLabelNat
+    let code dst = toOL [
+           LDATA ReadOnlyData
+                       [CmmDataLabel lbl,
+                        CmmStaticLit (CmmFloat f rep)],
+           MOV rep (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+           ]
+    -- in
+    return (Any rep code)
+
+#endif /* x86_64_TARGET_ARCH */
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- catch simple cases of zero- or sign-extended load
  getRegister (CmmMachOp (MO_U_Conv I8 I32) [CmmLoad addr _]) = do
@@ -787,11 +834,87 @@ getRegister (CmmMachOp (MO_S_Conv I16 I32) [CmmLoad addr _]) = do
    code <- intLoadCode (MOVSxL I16) addr
    return (Any I32 code)
  
+#endif
+
+#if x86_64_TARGET_ARCH
+
+-- catch simple cases of zero- or sign-extended load
+getRegister (CmmMachOp (MO_U_Conv I8 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVZxL I8) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I8 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I8) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_U_Conv I16 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVZxL I16) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I16 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I16) addr
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_U_Conv I32 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOV I32) addr -- 32-bit loads zero-extend
+  return (Any I64 code)
+
+getRegister (CmmMachOp (MO_S_Conv I32 I64) [CmmLoad addr _]) = do
+  code <- intLoadCode (MOVSxL I32) addr
+  return (Any I64 code)
+
+#endif
+
+#if x86_64_TARGET_ARCH
+getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do
+  x_code <- getAnyReg x
+  lbl <- getNewLabelNat
+  let
+    code dst = x_code dst `appOL` toOL [
+       -- This is how gcc does it, so it can't be that bad:
+       LDATA ReadOnlyData16 [
+               CmmAlign 16,
+               CmmDataLabel lbl,
+               CmmStaticLit (CmmInt 0x80000000 I32),
+               CmmStaticLit (CmmInt 0 I32),
+               CmmStaticLit (CmmInt 0 I32),
+               CmmStaticLit (CmmInt 0 I32)
+       ],
+       XOR F32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+               -- xorps, so we need the 128-bit constant
+               -- ToDo: rip-relative
+       ]
+  --
+  return (Any F32 code)
+
+getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do
+  x_code <- getAnyReg x
+  lbl <- getNewLabelNat
+  let
+       -- This is how gcc does it, so it can't be that bad:
+    code dst = x_code dst `appOL` toOL [
+       LDATA ReadOnlyData16 [
+               CmmAlign 16,
+               CmmDataLabel lbl,
+               CmmStaticLit (CmmInt 0x8000000000000000 I64),
+               CmmStaticLit (CmmInt 0 I64)
+       ],
+               -- gcc puts an unpck here.  Wonder if we need it.
+       XOR F64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
+               -- xorpd, so we need the 128-bit constant
+       ]
+  --
+  return (Any F64 code)
+#endif
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  getRegister (CmmMachOp mop [x]) -- unary MachOps
    = case mop of
+#if i386_TARGET_ARCH
        MO_S_Neg F32 -> trivialUFCode F32 (GNEG F32) x
        MO_S_Neg F64 -> trivialUFCode F64 (GNEG F64) x
+#endif
  
        MO_S_Neg rep -> trivialUCode rep (NEGI rep) x
        MO_Not rep   -> trivialUCode rep (NOT  rep) x
@@ -805,6 +928,15 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
        MO_S_Conv I16 I8  -> conversionNop I16 x
        MO_U_Conv I32 I16 -> conversionNop I32 x
        MO_S_Conv I32 I16 -> conversionNop I32 x
+#if x86_64_TARGET_ARCH
+      MO_U_Conv I64 I32 -> conversionNop I64 x
+      MO_S_Conv I64 I32 -> conversionNop I64 x
+      MO_U_Conv I64 I16 -> conversionNop I64 x
+      MO_S_Conv I64 I16 -> conversionNop I64 x
+      MO_U_Conv I64 I8  -> conversionNop I64 x
+      MO_S_Conv I64 I8  -> conversionNop I64 x
+#endif
+
        MO_U_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x
        MO_S_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x
  
@@ -817,12 +949,32 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
        MO_S_Conv I16 I32 -> integerExtend I16 I32 MOVSxL x
        MO_S_Conv I8  I16 -> integerExtend I8  I16 MOVSxL x
  
+#if x86_64_TARGET_ARCH
+      MO_U_Conv I8  I64 -> integerExtend I8  I64 MOVZxL x
+      MO_U_Conv I16 I64 -> integerExtend I16 I64 MOVZxL x
+      MO_U_Conv I32 I64 -> integerExtend I32 I64 MOVZxL x
+      MO_S_Conv I8  I64 -> integerExtend I8  I64 MOVSxL x
+      MO_S_Conv I16 I64 -> integerExtend I16 I64 MOVSxL x
+      MO_S_Conv I32 I64 -> integerExtend I32 I64 MOVSxL x
+       -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+       -- However, we don't want the register allocator to throw it
+       -- away as an unnecessary reg-to-reg move, so we keep it in
+       -- the form of a movzl and print it as a movl later.
+#endif
+
+#if i386_TARGET_ARCH
        MO_S_Conv F32 F64 -> conversionNop F64 x
        MO_S_Conv F64 F32 -> conversionNop F32 x
+#else
+      MO_S_Conv F32 F64 -> coerceFP2FP F64 x
+      MO_S_Conv F64 F32 -> coerceFP2FP F32 x
+#endif
+
        MO_S_Conv from to
         | isFloatingRep from -> coerceFP2Int from to x
         | isFloatingRep to   -> coerceInt2FP from to x
  
+      other -> pprPanic "getRegister" (pprMachOp mop)
     where
         -- signed or unsigned extension.
         integerExtend from to instr expr = do
@@ -869,14 +1021,27 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
        MO_U_Lt rep -> condIntReg LU  x y
        MO_U_Le rep -> condIntReg LEU x y
  
-      MO_Add F32 -> trivialFCode F32  GADD x y
-      MO_Sub F32 -> trivialFCode F32  GSUB x y
+#if i386_TARGET_ARCH
+      MO_Add F32 -> trivialFCode F32 GADD x y
+      MO_Sub F32 -> trivialFCode F32 GSUB x y
  
        MO_Add F64 -> trivialFCode F64 GADD x y
        MO_Sub F64 -> trivialFCode F64 GSUB x y
  
-      MO_S_Quot F32 -> trivialFCode  F32  GDIV x y
+      MO_S_Quot F32 -> trivialFCode F32 GDIV x y
        MO_S_Quot F64 -> trivialFCode F64 GDIV x y
+#endif
+
+#if x86_64_TARGET_ARCH
+      MO_Add F32 -> trivialFCode F32 ADD x y
+      MO_Sub F32 -> trivialFCode F32 SUB x y
+
+      MO_Add F64 -> trivialFCode F64 ADD x y
+      MO_Sub F64 -> trivialFCode F64 SUB x y
+
+      MO_S_Quot F32 -> trivialFCode F32 FDIV x y
+      MO_S_Quot F64 -> trivialFCode F64 FDIV x y
+#endif
  
        MO_Add rep -> add_code rep x y
        MO_Sub rep -> sub_code rep x y
@@ -886,8 +1051,16 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
        MO_U_Quot rep -> div_code rep False True  x y
        MO_U_Rem  rep -> div_code rep False False x y
  
+#if i386_TARGET_ARCH
        MO_Mul   F32 -> trivialFCode F32 GMUL x y
        MO_Mul   F64 -> trivialFCode F64 GMUL x y
+#endif
+
+#if x86_64_TARGET_ARCH
+      MO_Mul   F32 -> trivialFCode F32 MUL x y
+      MO_Mul   F64 -> trivialFCode F64 MUL x y
+#endif
+
        MO_Mul   rep -> let op = IMUL rep in 
                       trivialCode rep op (Just op) x y
  
@@ -912,24 +1085,26 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
    where
      --------------------
      imulMayOflo :: MachRep -> CmmExpr -> CmmExpr -> NatM Register
-    imulMayOflo I32 a b = do
-         res_lo <- getNewRegNat I32
-         res_hi <- getNewRegNat I32
+    imulMayOflo rep a b = do
           (a_reg, a_code) <- getNonClobberedReg a
-         (b_reg, b_code) <- getSomeReg   b
+         b_code <- getAnyReg b
           let 
-             code dst = a_code `appOL` b_code `appOL`
+            shift_amt  = case rep of
+                          I32 -> 31
+                          I64 -> 63
+                          _ -> panic "shift_amt"
+
+             code = a_code `appOL` b_code eax `appOL`
                          toOL [
-                           MOV I32 (OpReg a_reg) (OpReg res_hi),
-                           MOV I32 (OpReg b_reg) (OpReg res_lo),
-                           IMUL64 res_hi res_lo,               -- result in res_hi:res_lo
-                           SAR I32 (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part
-                           SUB I32 (OpReg res_hi) (OpReg res_lo),      -- compare against upper
-                           MOV I32 (OpReg res_lo) (OpReg dst)
-                           -- dst==0 if high part == sign extended low part
+                          IMUL2 rep (OpReg a_reg),   -- result in %edx:%eax
+                           SAR rep (OpImm (ImmInt shift_amt)) (OpReg eax),
+                               -- sign extend lower part
+                           SUB rep (OpReg edx) (OpReg eax)
+                               -- compare against upper
+                           -- eax==0 if high part == sign extended low part
                          ]
           -- in
-        return (Any I32 code)
+        return (Fixed rep eax code)
  
      --------------------
      shift_code :: MachRep
@@ -977,17 +1152,17 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
             code dst
                 = x_code `snocOL`
                  LEA rep
-                       (OpAddr (AddrBaseIndex (Just x_reg) Nothing imm))
+                       (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                          (OpReg dst)
         -- 
         return (Any rep code)
  
      ----------------------
      div_code rep signed quotient x y = do
-          (y_op, y_code) <- getOperand y -- cannot be clobbered
+          (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
            x_code <- getAnyReg x
            let
-            widen | signed    = CLTD
+            widen | signed    = CLTD rep
                    | otherwise = XOR rep (OpReg edx) (OpReg edx)
  
              instr | signed    = IDIV
@@ -1004,17 +1179,18 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
             return (Fixed rep result code)
  
  
-
  getRegister (CmmLoad mem pk)
    | isFloatingRep pk
    = do
      Amode src mem_code <- getAmode mem
      let
         code dst = mem_code `snocOL` 
-                  GLD pk src dst
+                  IF_ARCH_i386(GLD pk src dst,
+                               MOV pk (OpAddr src) (OpReg dst))
      --
      return (Any pk code)
  
+#if i386_TARGET_ARCH
  getRegister (CmmLoad mem pk)
    | pk /= I64
    = do 
@@ -1029,14 +1205,47 @@ getRegister (CmmLoad mem pk)
         -- we can't guarantee access to an 8-bit variant of every register
         -- (esi and edi don't have 8-bit variants), so to make things
         -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+#endif
+
+#if x86_64_TARGET_ARCH
+-- Simpler memory load code on x86_64
+getRegister (CmmLoad mem pk)
+  = do 
+    code <- intLoadCode (MOV pk) mem
+    return (Any pk code)
+#endif
  
  getRegister (CmmLit (CmmInt 0 rep))
    = let
+       -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
+       adj_rep = case rep of I64 -> I32; _ -> rep
+       rep1 = IF_ARCH_i386( rep, adj_rep ) 
         code dst 
-           = unitOL (XOR rep (OpReg dst) (OpReg dst))
+           = unitOL (XOR rep1 (OpReg dst) (OpReg dst))
      in
         return (Any rep code)
  
+#if x86_64_TARGET_ARCH
+  -- optimisation for loading small literals on x86_64: take advantage
+  -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+  -- instruction forms are shorter.
+getRegister (CmmLit lit) 
+  | I64 <- cmmLitRep lit, not (isBigLit lit)
+  = let 
+       imm = litToImm lit
+       code dst = unitOL (MOV I32 (OpImm imm) (OpReg dst))
+    in
+       return (Any I64 code)
+  where
+   isBigLit (CmmInt i I64) = i < 0 || i > 0xffffffff
+   isBigLit _ = False
+       -- note1: not the same as is64BitLit, because that checks for
+       -- signed literals that fit in 32 bits, but we want unsigned
+       -- literals here.
+       -- note2: all labels are small, because we're assuming the
+       -- small memory model (see gcc docs, -mcmodel=small).
+#endif
+
  getRegister (CmmLit lit)
    = let 
         rep = cmmLitRep lit
@@ -1045,7 +1254,7 @@ getRegister (CmmLit lit)
      in
         return (Any rep code)
  
-getRegister other = panic "getRegister(x86)"
+getRegister other = pprPanic "getRegister(x86)" (ppr other)
  
  
  intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
@@ -1065,22 +1274,13 @@ anyReg :: Register -> NatM (Reg -> InstrBlock)
  anyReg (Any _ code)          = return code
  anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
  
--- The dual to getAnyReg: compute an expression into a register, but
--- we don't mind which one it is.
-getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
-getSomeReg expr = do
-  r <- getRegister expr
-  case r of
-    Any rep code -> do
-       tmp <- getNewRegNat rep
-       return (tmp, code tmp)
-    Fixed _ reg code -> 
-       return (reg, code)
-
  -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
  -- Fixed registers might not be byte-addressable, so we make sure we've
  -- got a temporary, inserting an extra reg copy if necessary.
  getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
+#if x86_64_TARGET_ARCH
+getByteReg = getSomeReg -- all regs are byte-addressable on x86_64
+#else
  getByteReg expr = do
    r <- getRegister expr
    case r of
@@ -1094,6 +1294,7 @@ getByteReg expr = do
             return (tmp, code `snocOL` reg2reg rep reg tmp)
         -- ToDo: could optimise slightly by checking for byte-addressable
         -- real registers, but that will happen very rarely if at all.
+#endif
  
  -- Another variant: this time we want the result in a register that cannot
  -- be modified by code to evaluate an arbitrary expression.
@@ -1114,10 +1315,12 @@ getNonClobberedReg expr = do
  
  reg2reg :: MachRep -> Reg -> Reg -> Instr
  reg2reg rep src dst 
+#if i386_TARGET_ARCH
    | isFloatingRep rep = GMOV src dst
+#endif
    | otherwise        = MOV rep (OpReg src) (OpReg dst)
  
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -1553,19 +1756,6 @@ extendSExpr rep x = CmmMachOp (MO_S_Conv rep I32) [x]
  extendUExpr I32 x = x
  extendUExpr rep x = CmmMachOp (MO_U_Conv rep I32) [x]
  
--- ###FIXME: exact code duplication from x86 case
--- The dual to getAnyReg: compute an expression into a register, but
--- we don't mind which one it is.
-getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
-getSomeReg expr = do
-  r <- getRegister expr
-  case r of
-    Any rep code -> do
-       tmp <- getNewRegNat rep
-       return (tmp, code tmp)
-    Fixed _ reg code -> 
-       return (reg, code)
-
  #endif /* powerpc_TARGET_ARCH */
  
  
@@ -1639,21 +1829,23 @@ getAmode other
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- This is all just ridiculous, since it carefully undoes 
  -- what mangleIndexTree has just done.
-getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit (CmmInt i _)])
+getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)])
+  | not (is64BitLit lit)
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (-(fromInteger i))
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
    
-getAmode (CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt i _)])
+getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)])
+  | not (is64BitLit lit)
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (fromInteger i)
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
  
  -- Turn (lit1 << n  + lit2) into  (lit2 + lit1 << n) so it will be 
  -- recognised by the next rule.
@@ -1671,17 +1863,17 @@ getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _)
         let
            code = x_code `appOL` y_code
             base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
-       return (Amode (AddrBaseIndex (Just x_reg) (Just (y_reg,base)) (ImmInt 0))
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt 0))
                 code)
  
-getAmode (CmmLit lit)
+getAmode (CmmLit lit) | not (is64BitLit lit)
    = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
  
  getAmode expr = do
    (reg,code) <- getSomeReg expr
-  return (Amode (AddrBaseIndex (Just reg) Nothing (ImmInt 0)) code)
+  return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
  
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -1795,29 +1987,38 @@ getAmode other
  -- -----------------------------------------------------------------------------
  -- getOperand: sometimes any operand will do.
  
--- getOperand gets a *safe* operand; that is, the value of the operand
--- will remain valid across the computation of an arbitrary expression,
--- unless the expression is computed directly into a register which
--- the operand refers to (see trivialCode where this function is used
--- for an example).
+-- getNonClobberedOperand: the value of the operand will remain valid across
+-- the computation of an arbitrary expression, unless the expression
+-- is computed directly into a register which the operand refers to
+-- (see trivialCode where this function is used for an example).
  
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
-getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
-getOperand (CmmLoad mem pk) 
-  | not (isFloatingRep pk) && pk /= I64 = do
+getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
+getNonClobberedOperand (CmmLit lit)
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
+getNonClobberedOperand (CmmLit lit)
+  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) =
+    return (OpImm (litToImm lit), nilOL)
+getNonClobberedOperand (CmmLoad mem pk) 
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
      Amode src mem_code <- getAmode mem
      (src',save_code) <- 
         if (amodeCouldBeClobbered src) 
                 then do
                    tmp <- getNewRegNat wordRep
-                  return (AddrBaseIndex (Just tmp) Nothing (ImmInt 0),
+                  return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                            unitOL (LEA I32 (OpAddr src) (OpReg tmp)))
                 else
                    return (src, nilOL)
      return (OpAddr src', save_code `appOL` mem_code)
-
-getOperand e = do
+getNonClobberedOperand e = do
      (reg, code) <- getNonClobberedReg e
      return (OpReg reg, code)
  
@@ -1827,6 +2028,56 @@ amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode)
  regClobbered (RealReg rr) = isFastTrue (freeReg rr)
  regClobbered _ = False
  
+-- getOperand: the operand is not required to remain valid across the
+-- computation of an arbitrary expression.
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
+getOperand (CmmLit lit)
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
+getOperand (CmmLit lit)
+  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = do
+    return (OpImm (litToImm lit), nilOL)
+getOperand (CmmLoad mem pk)
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
+    Amode src mem_code <- getAmode mem
+    return (OpAddr src, mem_code)
+getOperand e = do
+    (reg, code) <- getSomeReg e
+    return (OpReg reg, code)
+
+isOperand :: CmmExpr -> Bool
+isOperand (CmmLoad _ _) = True
+isOperand (CmmLit lit)  = not (is64BitLit lit)
+                         || isSuitableFloatingPointLit lit
+isOperand _             = False
+
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory.  However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
+getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
+getRegOrMem (CmmLoad mem pk)
+  | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
+    Amode src mem_code <- getAmode mem
+    return (OpAddr src, mem_code)
+getRegOrMem e = do
+    (reg, code) <- getNonClobberedReg e
+    return (OpReg reg, code)
+
+#if x86_64_TARGET_ARCH
+is64BitLit (CmmInt i I64) = i > 0x7fffffff || i < -0x80000000
+   -- assume that labels are in the range 0-2^31-1: this assumes the
+   -- small memory model (see gcc docs, -mcmodel=small).
+#endif
+is64BitLit x = False
  #endif
  
  -- -----------------------------------------------------------------------------
@@ -1846,7 +2097,7 @@ getCondCode = panic "MachCode.getCondCode: not on Alphas"
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH || sparc_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH || sparc_TARGET_ARCH
  -- yes, they really do seem to want exactly the same!
  
  getCondCode (CmmMachOp mop [x, y])
@@ -1940,10 +2191,10 @@ condFltCode = panic "MachCode.condFltCode: not on Alphas"
  #endif /* alpha_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- memory vs immediate
-condIntCode cond (CmmLoad x pk) (CmmLit lit) = do
+condIntCode cond (CmmLoad x pk) (CmmLit lit) | not (is64BitLit lit) = do
      Amode x_addr x_code <- getAmode x
      let
         imm  = litToImm lit
@@ -1961,50 +2212,29 @@ condIntCode cond x (CmmLit (CmmInt 0 pk)) = do
      --
      return (CondCode False cond code)
  
--- anything vs immediate
-condIntCode cond x (CmmLit lit) = do
-    (x_reg, x_code) <- getSomeReg x
+-- anything vs operand
+condIntCode cond x y | isOperand y = do
+    (x_reg, x_code) <- getNonClobberedReg x
+    (y_op,  y_code) <- getOperand y    
      let
-       imm  = litToImm lit
-       code = x_code `snocOL`
-                  CMP (cmmLitRep lit) (OpImm imm) (OpReg x_reg)
-    -- in
-    return (CondCode False cond code)
-
--- memory vs anything
-condIntCode cond (CmmLoad x pk) y = do
-    (y_reg, y_code) <- getNonClobberedReg y
-    Amode x_addr x_code <- getAmode x
-    let
-       code = y_code `appOL`
-               x_code `snocOL`
-                 CMP pk (OpReg y_reg) (OpAddr x_addr)
-    -- in
-    return (CondCode False cond code)
-
--- anything vs memory
-condIntCode cond y (CmmLoad x pk) = do
-    (y_reg, y_code) <- getNonClobberedReg y
-    Amode x_addr x_code <- getAmode x
-    let
-       code = y_code `appOL`
-               x_code `snocOL`
-                 CMP pk (OpAddr x_addr) (OpReg y_reg)
+       code = x_code `appOL` y_code `snocOL`
+                  CMP (cmmExprRep x) y_op (OpReg x_reg)
      -- in
      return (CondCode False cond code)
  
  -- anything vs anything
  condIntCode cond x y = do
-  (x_op, x_code) <- getOperand x
-  (y_reg, y_code) <- getSomeReg y
+  (y_reg, y_code) <- getNonClobberedReg y
+  (x_op, x_code) <- getRegOrMem x
    let
-       code = x_code `appOL`
-              y_code `snocOL`
+       code = y_code `appOL`
+              x_code `snocOL`
                   CMP (cmmExprRep x) (OpReg y_reg) x_op
    -- in
    return (CondCode False cond code)
+#endif
  
------------
+#if i386_TARGET_ARCH
  condFltCode cond x y 
    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
    (x_reg, x_code) <- getNonClobberedReg x
@@ -2015,9 +2245,25 @@ condFltCode cond x y
    -- The GCMP insn does the test and sets the zero flag if comparable
    -- and true.  Hence we always supply EQQ as the condition to test.
    return (CondCode True EQQ code)
-
  #endif /* i386_TARGET_ARCH */
  
+#if x86_64_TARGET_ARCH
+-- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+-- an operand, but the right must be a reg.  We can probably do better
+-- than this general case...
+condFltCode cond x y = do
+  (x_reg, x_code) <- getNonClobberedReg x
+  (y_op, y_code) <- getOperand y
+  let
+       code = x_code `appOL`
+              y_code `snocOL`
+                 CMP (cmmExprRep x) y_op (OpReg x_reg)
+       -- NB(1): we need to use the unsigned comparison operators on the
+       -- result of this comparison.
+  -- in
+  return (CondCode True (condToUnsigned cond) code)
+#endif
+
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if sparc_TARGET_ARCH
@@ -2085,7 +2331,7 @@ condFltCode cond x y
  #endif /* sparc_TARGET_ARCH */
  
  #if powerpc_TARGET_ARCH
--- ###FIXME: I16 and I8!
+--  ###FIXME: I16 and I8!
  condIntCode cond x (CmmLit (CmmInt y rep))
    | Just src2 <- makeImmediate rep (not $ condUnsigned cond) y
    = do
@@ -2171,7 +2417,7 @@ assignIntCode pk dst src
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- integer assignment to memory
  assignMem_IntCode pk addr src = do
@@ -2189,7 +2435,7 @@ assignMem_IntCode pk addr src = do
      return code
    where
      get_op_RI :: CmmExpr -> NatM (InstrBlock,Operand)  -- code, operator
-    get_op_RI (CmmLit lit)
+    get_op_RI (CmmLit lit) | not (is64BitLit lit)
        = return (nilOL, OpImm (litToImm lit))
      get_op_RI op
        = do (reg,code) <- getNonClobberedReg op
@@ -2298,7 +2544,7 @@ assignFltCode pk dst src
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- Floating point assignment to memory
  assignMem_FltCode pk addr src = do
@@ -2307,7 +2553,8 @@ assignMem_FltCode pk addr src = do
    let
         code = src_code `appOL`
                addr_code `snocOL`
-                GST pk src_reg addr
+                IF_ARCH_i386(GST pk src_reg addr,
+                            MOV pk (OpReg src_reg) (OpAddr addr))
    return code
  
  -- Floating point assignment to a register/temporary
@@ -2416,7 +2663,7 @@ genJump tree
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  genJump (CmmLoad mem pk) = do
    Amode target code <- getAmode mem
@@ -2471,7 +2718,7 @@ genBranch :: BlockId -> NatM InstrBlock
  genBranch id = return (unitOL (BR id))
  #endif
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  genBranch id = return (unitOL (JXX ALWAYS id))
  #endif
  
@@ -2667,8 +2914,45 @@ genCondJump id bool = do
    CondCode _ cond code <- getCondCode bool
    return (code `snocOL` JXX cond id)
  
-#endif /* i386_TARGET_ARCH */
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
+#if x86_64_TARGET_ARCH
+
+genCondJump id bool = do
+  CondCode is_float cond cond_code <- getCondCode bool
+  if not is_float
+    then
+       return (cond_code `snocOL` JXX cond id)
+    else do
+       lbl <- getBlockIdNat
+
+       -- see comment with condFltReg
+       let code = case cond of
+                       NE  -> or_unordered
+                       GU  -> plain_test
+                       GEU -> plain_test
+                       _   -> and_ordered
+
+           plain_test = unitOL (
+                 JXX cond id
+               )
+           or_unordered = toOL [
+                 JXX cond id,
+                 JXX PARITY id
+               ]
+           and_ordered = toOL [
+                 JXX PARITY lbl,
+                 JXX cond id,
+                 JXX ALWAYS lbl,
+                 NEWBLOCK lbl
+               ]
+       return (cond_code `appOL` code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if sparc_TARGET_ARCH
  
@@ -2823,19 +3107,19 @@ genCCall target dest_regs args vols = do
         -- CmmPrim -> ...
          CmmForeignCall (CmmLit (CmmLabel lbl)) conv
             -> -- ToDo: stdcall arg sizes
-             return (unitOL (CALL (Left fn_imm)), conv)
+             return (unitOL (CALL (Left fn_imm) []), conv)
            where fn_imm = ImmCLbl lbl
          CmmForeignCall expr conv
             -> do (dyn_c, dyn_r, dyn_rep) <- get_op expr
                   ASSERT(dyn_rep == I32)
-                  return (dyn_c `snocOL` CALL (Right dyn_r), conv)
+                  return (dyn_c `snocOL` CALL (Right dyn_r) [], conv)
  
      let        push_code = concatOL push_codes
         call = callinsns `appOL`
                 toOL (
                         -- Deallocate parameters after call for ccall;
                         -- but not for stdcall (callee does it)
-                  (if cconv == StdCallConv then [] else 
+                  (if cconv == StdCallConv || tot_arg_size==0 then [] else 
                    [ADD I32 (OpImm (ImmInt tot_arg_size)) (OpReg esp)])
                    ++
                    [DELTA (delta + tot_arg_size)]
@@ -2895,8 +3179,8 @@ genCCall target dest_regs args vols = do
                          code `appOL`
                          toOL [SUB I32 (OpImm (ImmInt size)) (OpReg esp),
                                DELTA (delta-size),
-                              GST sz reg (AddrBaseIndex (Just esp) 
-                                                        Nothing 
+                              GST sz reg (AddrBaseIndex (EABaseReg esp) 
+                                                        EAIndexNone
                                                          (ImmInt 0))]
                         )
             else return (size,
@@ -2913,6 +3197,9 @@ genCCall target dest_regs args vols = do
          (reg,code) <- getSomeReg op
         return (code, reg, cmmExprRep op)
  
+#endif /* i386_TARGET_ARCH */
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  outOfLineFloatOp :: CallishMachOp -> CmmReg -> [(CmmExpr,MachHint)]
    -> Maybe [GlobalReg] -> NatM InstrBlock
@@ -2936,6 +3223,10 @@ outOfLineFloatOp mop res args vols
         lbl = CmmLabel (mkForeignLabel fn Nothing False)
  
         fn = case mop of
+             MO_F32_Sqrt  -> FSLIT("sqrt")
+             MO_F32_Sin   -> FSLIT("sin")
+             MO_F32_Cos   -> FSLIT("cos")
+             MO_F32_Tan   -> FSLIT("tan")
               MO_F32_Exp   -> FSLIT("exp")
               MO_F32_Log   -> FSLIT("log")
  
@@ -2948,6 +3239,10 @@ outOfLineFloatOp mop res args vols
               MO_F32_Tanh  -> FSLIT("tanh")
               MO_F32_Pwr   -> FSLIT("pow")
  
+             MO_F64_Sqrt  -> FSLIT("sqrt")
+             MO_F64_Sin   -> FSLIT("sin")
+             MO_F64_Cos   -> FSLIT("cos")
+             MO_F64_Tan   -> FSLIT("tan")
               MO_F64_Exp   -> FSLIT("exp")
               MO_F64_Log   -> FSLIT("log")
  
@@ -2960,9 +3255,167 @@ outOfLineFloatOp mop res args vols
               MO_F64_Tanh  -> FSLIT("tanh")
               MO_F64_Pwr   -> FSLIT("pow")
  
-              other -> pprPanic "outOfLineFloatOp" (pprCallishMachOp mop)
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
-#endif /* i386_TARGET_ARCH */
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if x86_64_TARGET_ARCH
+
+genCCall (CmmPrim op) [(r,_)] args vols = 
+  outOfLineFloatOp op r args vols
+
+genCCall target dest_regs args vols = do
+
+       -- load up the register arguments
+    (stack_args, aregs, fregs, load_args_code)
+        <- load_args args allArgRegs allFPArgRegs nilOL
+
+    let
+       fp_regs_used  = reverse (drop (length fregs) (reverse allFPArgRegs))
+       int_regs_used = reverse (drop (length aregs) (reverse allArgRegs))
+       arg_regs = int_regs_used ++ fp_regs_used
+               -- for annotating the call instruction with
+
+       sse_regs = length fp_regs_used
+
+       tot_arg_size = arg_size * length stack_args
+
+       -- On entry to the called function, %rsp should be aligned
+       -- on a 16-byte boundary +8 (i.e. the first stack arg after
+       -- the return address is 16-byte aligned).  In STG land
+       -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
+       -- need to make sure we push a multiple of 16-bytes of args,
+       -- plus the return address, to get the correct alignment.
+       -- Urg, this is hard.  We need to feed the delta back into
+       -- the arg pushing code.
+    (real_size, adjust_rsp) <-
+       if tot_arg_size `rem` 16 == 0
+           then return (tot_arg_size, nilOL)
+           else do -- we need to adjust...
+               delta <- getDeltaNat
+               setDeltaNat (delta-8)
+               return (tot_arg_size+8, toOL [
+                               SUB I64 (OpImm (ImmInt 8)) (OpReg rsp),
+                               DELTA (delta-8)
+                       ])
+
+       -- push the stack args, right to left
+    push_code <- push_args (reverse stack_args) nilOL
+    delta <- getDeltaNat
+
+    -- deal with static vs dynamic call targets
+    (callinsns,cconv) <-
+      case target of
+       -- CmmPrim -> ...
+        CmmForeignCall (CmmLit (CmmLabel lbl)) conv
+           -> -- ToDo: stdcall arg sizes
+             return (unitOL (CALL (Left fn_imm) arg_regs), conv)
+          where fn_imm = ImmCLbl lbl
+        CmmForeignCall expr conv
+           -> do (dyn_r, dyn_c) <- getSomeReg expr
+                return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
+
+    let
+       -- The x86_64 ABI requires us to set %al to the number of SSE
+       -- registers that contain arguments, if the called routine
+       -- is a varargs function.  We don't know whether it's a
+       -- varargs function or not, so we have to assume it is.
+       --
+       -- It's not safe to omit this assignment, even if the number
+       -- of SSE regs in use is zero.  If %al is larger than 8
+       -- on entry to a varargs function, seg faults ensue.
+       assign_eax n = unitOL (MOV I32 (OpImm (ImmInt n)) (OpReg eax))
+
+    let call = callinsns `appOL`
+               toOL (
+                       -- Deallocate parameters after call for ccall;
+                       -- but not for stdcall (callee does it)
+                  (if cconv == StdCallConv || real_size==0 then [] else 
+                  [ADD wordRep (OpImm (ImmInt real_size)) (OpReg esp)])
+                  ++
+                  [DELTA (delta + real_size)]
+               )
+    -- in
+    setDeltaNat (delta + real_size)
+
+    let
+       -- assign the results, if necessary
+       assign_code []     = nilOL
+       assign_code [(dest,_hint)] = 
+         case rep of
+               F32 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest))
+               F64 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest))
+               rep -> unitOL (MOV rep (OpReg rax) (OpReg r_dest))
+         where 
+               rep = cmmRegRep dest
+               r_dest = getRegisterReg dest
+       assign_code many = panic "genCCall.assign_code many"
+
+    return (load_args_code     `appOL` 
+           adjust_rsp          `appOL`
+           push_code           `appOL`
+           assign_eax sse_regs `appOL`
+           call                `appOL` 
+           assign_code dest_regs)
+
+  where
+    arg_size = 8 -- always, at the mo
+
+    load_args :: [(CmmExpr,MachHint)]
+             -> [Reg]                  -- int regs avail for args
+             -> [Reg]                  -- FP regs avail for args
+             -> InstrBlock
+             -> NatM ([(CmmExpr,MachHint)],[Reg],[Reg],InstrBlock)
+    load_args args [] [] code     =  return (args, [], [], code)
+       -- no more regs to use
+    load_args [] aregs fregs code =  return ([], aregs, fregs, code)
+       -- no more args to push
+    load_args ((arg,hint) : rest) aregs fregs code
+       | isFloatingRep arg_rep = 
+       case fregs of
+         [] -> push_this_arg
+         (r:rs) -> do
+            arg_code <- getAnyReg arg
+            load_args rest aregs rs (code `appOL` arg_code r)
+       | otherwise =
+       case aregs of
+         [] -> push_this_arg
+         (r:rs) -> do
+            arg_code <- getAnyReg arg
+            load_args rest rs fregs (code `appOL` arg_code r)
+       where
+         arg_rep = cmmExprRep arg
+
+         push_this_arg = do
+           (args',ars,frs,code') <- load_args rest aregs fregs code
+           return ((arg,hint):args', ars, frs, code')
+
+    push_args [] code = return code
+    push_args ((arg,hint):rest) code
+       | isFloatingRep arg_rep = do
+        (arg_reg, arg_code) <- getSomeReg arg
+         delta <- getDeltaNat
+         setDeltaNat (delta-arg_size)
+        let code' = code `appOL` toOL [
+                       MOV arg_rep (OpReg arg_reg) (OpAddr  (spRel 0)),
+                       SUB wordRep (OpImm (ImmInt arg_size)) (OpReg rsp) ,
+                       DELTA (delta-arg_size)]
+        push_args rest code'
+
+       | otherwise = do
+       -- we only ever generate word-sized function arguments.  Promotion
+       -- has already happened: our Int8# type is kept sign-extended
+       -- in an Int#, for example.
+        ASSERT(arg_rep == I64) return ()
+        (arg_op, arg_code) <- getOperand arg
+         delta <- getDeltaNat
+         setDeltaNat (delta-arg_size)
+        let code' = code `appOL` toOL [PUSH I64 arg_op, 
+                                       DELTA (delta-arg_size)]
+        push_args rest code'
+       where
+         arg_rep = cmmExprRep arg
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -3365,13 +3818,13 @@ genCCall target dest_regs argsAndHints vols
  
  genSwitch :: CmmExpr -> [Maybe BlockId] -> NatM InstrBlock
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  genSwitch expr ids = do
    (reg,e_code) <- getSomeReg expr
    lbl <- getNewLabelNat
    let
         jumpTable = map jumpTableEntry ids
-       op = OpAddr (AddrBaseIndex Nothing (Just (reg,4)) (ImmCLbl lbl))
+       op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl))
         code = e_code `appOL` toOL [
                 LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
                 JMP_TBL op [ id | Just id <- ids ]
@@ -3388,12 +3841,19 @@ genSwitch expr ids
          dynRef <- cmmMakeDynamicReference addImportNat False lbl
          (tableReg,t_code) <- getSomeReg $ dynRef
          let
-            jumpTable = map jumpTableEntry ids
-        
+            jumpTable = map jumpTableEntryRel ids
+            
+            jumpTableEntryRel Nothing
+                = CmmStaticLit (CmmInt 0 wordRep)
+            jumpTableEntryRel (Just (BlockId id))
+                = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
+                where blockLabel = mkAsmTempLabel id
+
              code = e_code `appOL` t_code `appOL` toOL [
                              LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
                              SLW tmp reg (RIImm (ImmInt 2)),
                              LD I32 tmp (AddrRegReg tableReg tmp),
+                            ADD tmp tmp (RIReg tableReg),
                              MTCTR tmp,
                              BCTR [ id | Just id <- ids ]
                      ]
@@ -3448,7 +3908,7 @@ condFltReg = panic "MachCode.condFltReg (not on Alpha)"
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  condIntReg cond x y = do
    CondCode _ cond cond_code <- condIntCode cond x y
@@ -3456,38 +3916,77 @@ condIntReg cond x y = do
    let 
         code dst = cond_code `appOL` toOL [
                     SETCC cond (OpReg tmp),
-                   MOV I32 (OpReg tmp) (OpReg dst),
-                   AND I32 (OpImm (ImmInt 1)) (OpReg dst)
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
                   ]
-                  -- NB. (1) Tha AND is needed here because the x86 only
-                  -- sets the low byte in the SETCC instruction.
-                  -- NB. (2) The extra temporary register is a hack to
-                  -- work around the fact that the setcc instructions only
-                  -- accept byte registers.  dst might not be a byte-able reg,
-                  -- but currently all free registers are byte-able, so we're
-                  -- guaranteed that a new temporary is byte-able.
    -- in
    return (Any I32 code)
  
+#endif
+
+#if i386_TARGET_ARCH
  
  condFltReg cond x y = do
-  lbl1 <- getBlockIdNat
-  lbl2 <- getBlockIdNat
    CondCode _ cond cond_code <- condFltCode cond x y
-  let
-       code dst = cond_code `appOL` toOL [
-           JXX cond lbl1,
-           MOV I32 (OpImm (ImmInt 0)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl1,
-           MOV I32 (OpImm (ImmInt 1)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl2]
-               -- SIGH, have to split up this block somehow...
+  tmp <- getNewRegNat I8
+  let 
+       code dst = cond_code `appOL` toOL [
+                   SETCC cond (OpReg tmp),
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
+                 ]
    -- in
    return (Any I32 code)
  
-#endif /* i386_TARGET_ARCH */
+#endif
+
+#if x86_64_TARGET_ARCH
+
+condFltReg cond x y = do
+  CondCode _ cond cond_code <- condFltCode cond x y
+  tmp1 <- getNewRegNat wordRep
+  tmp2 <- getNewRegNat wordRep
+  let 
+       -- We have to worry about unordered operands (eg. comparisons
+       -- against NaN).  If the operands are unordered, the comparison
+       -- sets the parity flag, carry flag and zero flag.
+       -- All comparisons are supposed to return false for unordered
+       -- operands except for !=, which returns true.
+       --
+       -- Optimisation: we don't have to test the parity flag if we
+       -- know the test has already excluded the unordered case: eg >
+       -- and >= test for a zero carry flag, which can only occur for
+       -- ordered operands.
+       --
+       -- ToDo: by reversing comparisons we could avoid testing the
+       -- parity flag in more cases.
+
+       code dst = 
+          cond_code `appOL` 
+            (case cond of
+               NE  -> or_unordered dst
+               GU  -> plain_test   dst
+               GEU -> plain_test   dst
+               _   -> and_ordered  dst)
+
+       plain_test dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   MOVZxL I8 (OpReg tmp1) (OpReg dst)
+                ]
+       or_unordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC PARITY (OpReg tmp2),
+                   OR I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+       and_ordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC NOTPARITY (OpReg tmp2),
+                   AND I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+  -- in
+  return (Any I32 code)
+
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -3651,9 +4150,11 @@ trivialCode
      -> IF_ARCH_alpha((Reg -> RI -> Reg -> Instr)
        ,IF_ARCH_i386 ((Operand -> Operand -> Instr) 
                       -> Maybe (Operand -> Operand -> Instr)
+      ,IF_ARCH_x86_64 ((Operand -> Operand -> Instr) 
+                     -> Maybe (Operand -> Operand -> Instr)
        ,IF_ARCH_sparc((Reg -> RI -> Reg -> Instr)
        ,IF_ARCH_powerpc(Bool -> (Reg -> Reg -> RI -> Instr)
-      ,))))
+      ,)))))
      -> CmmExpr -> CmmExpr -- the two arguments
      -> NatM Register
  
@@ -3663,7 +4164,8 @@ trivialFCode
      -> IF_ARCH_alpha((Reg -> Reg -> Reg -> Instr)
        ,IF_ARCH_sparc((Size -> Reg -> Reg -> Reg -> Instr)
        ,IF_ARCH_i386 ((MachRep -> Reg -> Reg -> Reg -> Instr)
-      ,)))
+      ,IF_ARCH_x86_64 ((MachRep -> Operand -> Operand -> Instr)
+      ,))))
      -> CmmExpr -> CmmExpr -- the two arguments
      -> NatM Register
  #endif
@@ -3672,9 +4174,10 @@ trivialUCode
      :: MachRep 
      -> IF_ARCH_alpha((RI -> Reg -> Instr)
        ,IF_ARCH_i386 ((Operand -> Instr)
+      ,IF_ARCH_x86_64 ((Operand -> Instr)
        ,IF_ARCH_sparc((RI -> Reg -> Instr)
        ,IF_ARCH_powerpc((Reg -> Reg -> Instr)
-      ,))))
+      ,)))))
      -> CmmExpr -- the one argument
      -> NatM Register
  
@@ -3683,8 +4186,9 @@ trivialUFCode
      :: MachRep
      -> IF_ARCH_alpha((Reg -> Reg -> Instr)
        ,IF_ARCH_i386 ((Reg -> Reg -> Instr)
+      ,IF_ARCH_x86_64 ((Reg -> Reg -> Instr)
        ,IF_ARCH_sparc((Reg -> Reg -> Instr)
-      ,)))
+      ,))))
      -> CmmExpr -- the one argument
      -> NatM Register
  #endif
@@ -3763,7 +4267,7 @@ trivialUFCode _ instr x
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  {-
  The Rules of the Game are:
@@ -3810,16 +4314,8 @@ SDM's version of The Rules:
    register happens to be the destination register.
  -}
  
-trivialCode rep instr maybe_revinstr a (CmmLit lit_b) = do
-  a_code <- getAnyReg a
-  let
-       code dst
-          = a_code dst `snocOL` 
-           instr (OpImm (litToImm lit_b)) (OpReg dst)
-  -- in
-  return (Any rep code)
-              
-trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
+trivialCode rep instr (Just revinstr) (CmmLit lit_a) b
+  | not (is64BitLit lit_a) = do
    b_code <- getAnyReg b
    let
         code dst 
@@ -3828,8 +4324,11 @@ trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
    -- in
    return (Any rep code)
  
-trivialCode rep instr maybe_revinstr a b = do
-  (b_op, b_code) <- getOperand b
+trivialCode rep instr maybe_revinstr a b = genTrivialCode rep instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode rep instr a b = do
+  (b_op, b_code) <- getNonClobberedOperand b
    a_code <- getAnyReg a
    tmp <- getNewRegNat rep
    let
@@ -3840,7 +4339,7 @@ trivialCode rep instr maybe_revinstr a b = do
       -- as the destination reg.  In this case, we have to save b in a
       -- new temporary across the computation of a.
       code dst
-       | dst `clashesWith` b_op =
+       | dst `regClashesWithOp` b_op =
                 b_code `appOL`
                 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
                 a_code dst `snocOL`
@@ -3851,9 +4350,10 @@ trivialCode rep instr maybe_revinstr a b = do
                 instr b_op (OpReg dst)
    -- in
    return (Any rep code)
- where
-  reg `clashesWith` OpReg reg2   = reg == reg2
-  reg `clashesWith` OpAddr amode = any (==reg) (addrModeRegs amode)
+
+reg `regClashesWithOp` OpReg reg2   = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+reg `regClashesWithOp` _            = False
  
  -----------
  
@@ -3868,6 +4368,8 @@ trivialUCode rep instr x = do
  
  -----------
  
+#if i386_TARGET_ARCH
+
  trivialFCode pk instr x y = do
    (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
    (y_reg, y_code) <- getSomeReg y
@@ -3879,6 +4381,14 @@ trivialFCode pk instr x y = do
    -- in
    return (Any pk code)
  
+#endif
+
+#if x86_64_TARGET_ARCH
+
+trivialFCode pk instr x y = genTrivialCode  pk (instr pk) x y
+
+#endif
+
  -------------
  
  trivialUFCode rep instr x = do
@@ -4069,7 +4579,7 @@ remainderCode rep div x y = do
  coerceInt2FP :: MachRep -> MachRep -> CmmExpr -> NatM Register
  coerceFP2Int :: MachRep -> MachRep -> CmmExpr -> NatM Register
  
-#ifdef sparc_TARGET_ARCH
+#if sparc_TARGET_ARCH
  coerceDbl2Flt :: CmmExpr -> NatM Register
  coerceFlt2Dbl :: CmmExpr -> NatM Register
  #endif
@@ -4137,6 +4647,37 @@ coerceFP2Int from to x = do
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
+#if x86_64_TARGET_ARCH
+
+coerceFP2Int from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceInt2FP from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case to of F32 -> CVTSI2SS; F64 -> CVTSI2SD
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceFP2FP :: MachRep -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+  (x_reg, x_code) <- getSomeReg x
+  let
+        opc  = case to of F32 -> CVTSD2SS; F64 -> CVTSS2SD
+        code dst = x_code `snocOL` opc x_reg dst
+  -- in
+  return (Any to code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
  #if sparc_TARGET_ARCH
  
  coerceInt2FP pk x