From 6c55401037f9ff01170e3979dca51f6b2a9a8293 Mon Sep 17 00:00:00 2001 From: simonmar Date: Fri, 1 Apr 2005 12:14:30 +0000 Subject: [PATCH] [project @ 2005-04-01 12:14:29 by simonmar] First cut at the x86_64 native code generator. Lots of code is shared with i386, but floating point uses SSE2. This more or less works, the things I know that don't work are: - the floating-point primitives (sin, cos etc.) are missing - floating-point comparisons involving NaN are wrong - there's no PIC support yet Also, I have a long list of small things to fix up to improve performance. I think the small memory model is assumed, for now. --- ghc/compiler/cmm/Cmm.hs | 1 + ghc/compiler/nativeGen/AsmCodeGen.lhs | 37 +- ghc/compiler/nativeGen/MachCodeGen.hs | 652 ++++++++++++++++++++++++++------ ghc/compiler/nativeGen/MachInstrs.hs | 106 +++--- ghc/compiler/nativeGen/MachRegs.lhs | 160 +++++++- ghc/compiler/nativeGen/NCG.h | 6 + ghc/compiler/nativeGen/PprMach.hs | 284 +++++++++++--- ghc/compiler/nativeGen/RegAllocInfo.hs | 62 ++- 8 files changed, 1050 insertions(+), 258 deletions(-) diff --git a/ghc/compiler/cmm/Cmm.hs b/ghc/compiler/cmm/Cmm.hs index aa92e01..a8576ec 100644 --- a/ghc/compiler/cmm/Cmm.hs +++ b/ghc/compiler/cmm/Cmm.hs @@ -241,6 +241,7 @@ data Section | ReadOnlyData | RelocatableReadOnlyData | UninitialisedData + | ReadOnlyData16 -- .rodata.cst16 on x86_64, 16-byte aligned | OtherSection String data CmmStatic diff --git a/ghc/compiler/nativeGen/AsmCodeGen.lhs b/ghc/compiler/nativeGen/AsmCodeGen.lhs index e790991..2675a26 100644 --- a/ghc/compiler/nativeGen/AsmCodeGen.lhs +++ b/ghc/compiler/nativeGen/AsmCodeGen.lhs @@ -552,12 +552,37 @@ cmmMachOpFold op arg@[CmmLit (CmmInt x rep)] cmmMachOpFold (MO_S_Conv rep1 rep2) [x] | rep1 == rep2 = x cmmMachOpFold (MO_U_Conv rep1 rep2) [x] | rep1 == rep2 = x --- ToDo: eliminate multiple conversions. Be careful though: can't remove --- a narrowing, and can't remove conversions to/from floating point types. - --- ToDo: eliminate nested comparisons: --- CmmMachOp MO_Lt [CmmMachOp MO_Eq [x,y], CmmLit (CmmInt 0 _)] --- turns into a simple equality test. +-- Eliminate nested conversions where possible +cmmMachOpFold conv_outer args@[CmmMachOp conv_inner [x]] + | Just (rep1,rep2,signed1) <- isIntConversion conv_inner, + Just (_, rep3,signed2) <- isIntConversion conv_outer + = case () of + -- widen then narrow to the same size is a nop + _ | rep1 < rep2 && rep1 == rep3 -> x + -- Widen then narrow to different size: collapse to single conversion + -- but remember to use the signedness from the widening, just in case + -- the final conversion is a widen. + | rep1 < rep2 && rep2 > rep3 -> + cmmMachOpFold (intconv signed1 rep1 rep3) [x] + -- Nested widenings: collapse if the signedness is the same + | rep1 < rep2 && rep2 < rep3 && signed1 == signed2 -> + cmmMachOpFold (intconv signed1 rep1 rep3) [x] + -- Nested narrowings: collapse + | rep1 > rep2 && rep2 > rep3 -> + cmmMachOpFold (MO_U_Conv rep1 rep3) [x] + | otherwise -> + CmmMachOp conv_outer args + where + isIntConversion (MO_U_Conv rep1 rep2) = Just (rep1,rep2,False) + isIntConversion (MO_S_Conv rep1 rep2) = Just (rep1,rep2,True) + isIntConversion _ = Nothing + + intconv True = MO_S_Conv + intconv False = MO_U_Conv + +-- ToDo: a narrow of a load can be collapsed into a narrow load, right? +-- but what if the architecture only supports word-sized loads, should +-- we do the transformation anyway? 
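[Editorial aside] The nested-conversion rules above are easiest to see on a toy model. The following standalone Haskell sketch restates the four guards; Width, Conv and foldConv are illustrative names, not GHC's. Note how the collapsed conversion keeps the signedness of the inner widening:

data Width = W8 | W16 | W32 | W64 deriving (Eq, Ord, Show)
data Conv  = Conv { signed :: Bool, from :: Width, to :: Width } deriving Show

-- Fold `outer` applied to the result of `inner`:
--   Nothing       -> cannot fold
--   Just Nothing  -> the pair is a no-op
--   Just (Just c) -> the pair collapses to the single conversion c
foldConv :: Conv -> Conv -> Maybe (Maybe Conv)
foldConv outer inner
  | rep1 < rep2 && rep1 == rep3 = Just Nothing                       -- widen, then narrow back
  | rep1 < rep2 && rep2 > rep3  = Just (Just (Conv s1 rep1 rep3))    -- widen, then narrow
  | rep1 < rep2 && rep2 < rep3 && s1 == s2
                                = Just (Just (Conv s1 rep1 rep3))    -- nested widenings
  | rep1 > rep2 && rep2 > rep3  = Just (Just (Conv False rep1 rep3)) -- nested narrowings
  | otherwise                   = Nothing
  where
    (rep1, rep2, s1) = (from inner, to inner, signed inner)
    (rep3, s2)       = (to outer, signed outer)

For instance, an unsigned widening W8 -> W32 followed by a narrowing W32 -> W16 collapses to the single zero-extension Conv False W8 W16 — exactly the "remember to use the signedness from the widening" case in the comment above.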
cmmMachOpFold mop args@[CmmLit (CmmInt x xrep), CmmLit (CmmInt y _)] = case mop of diff --git a/ghc/compiler/nativeGen/MachCodeGen.hs b/ghc/compiler/nativeGen/MachCodeGen.hs index 35e0105..c93b678 100644 --- a/ghc/compiler/nativeGen/MachCodeGen.hs +++ b/ghc/compiler/nativeGen/MachCodeGen.hs @@ -37,6 +37,7 @@ import Outputable import qualified Outputable import FastString import FastTypes ( isFastTrue ) +import Constants ( wORD_SIZE ) #ifdef DEBUG import Outputable ( assertPanic ) @@ -102,15 +103,17 @@ stmtToInstrs stmt = case stmt of CmmAssign reg src | isFloatingRep kind -> assignReg_FltCode kind reg src - | wordRep == I32 && kind == I64 - -> assignReg_I64Code reg src +#if WORD_SIZE_IN_BITS==32 + | kind == I64 -> assignReg_I64Code reg src +#endif | otherwise -> assignReg_IntCode kind reg src where kind = cmmRegRep reg CmmStore addr src | isFloatingRep kind -> assignMem_FltCode kind addr src - | wordRep == I32 && kind == I64 - -> assignMem_I64Code addr src +#if WORD_SIZE_IN_BITS==32 + | kind == I64 -> assignMem_I64Code addr src +#endif | otherwise -> assignMem_IntCode kind addr src where kind = cmmExprRep src @@ -157,9 +160,14 @@ data ChildCode64 -- a.k.a "Register64" -- selection game are therefore that the returned -- Reg may be modified +#if WORD_SIZE_IN_BITS==32 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock +#endif + +#ifndef x86_64_TARGET_ARCH iselExpr64 :: CmmExpr -> NatM ChildCode64 +#endif -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -463,6 +471,21 @@ swizzleRegisterRep (Any _ codefn) rep = Any rep codefn -- ----------------------------------------------------------------------------- +-- Utils based on getRegister, below + +-- The dual to getAnyReg: compute an expression into a register, but +-- we don't mind which one it is. +getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed _ reg code -> + return (reg, code) + +-- ----------------------------------------------------------------------------- -- Grab the Reg for a CmmReg getRegisterReg :: CmmReg -> Reg @@ -769,6 +792,31 @@ getRegister (CmmLit (CmmFloat d F64)) -- in return (Any F64 code) +#endif /* i386_TARGET_ARCH */ + +#if x86_64_TARGET_ARCH + +getRegister (CmmLit (CmmFloat 0.0 rep)) = do + let code dst = unitOL (XOR rep (OpReg dst) (OpReg dst)) + -- I don't know why there are xorpd, xorps, and pxor instructions. 
+ -- They all appear to do the same thing --SDM + return (Any rep code) + +getRegister (CmmLit (CmmFloat f rep)) = do + lbl <- getNewLabelNat + let code dst = toOL [ + LDATA ReadOnlyData + [CmmDataLabel lbl, + CmmStaticLit (CmmFloat f rep)], + MOV rep (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- ToDo: should use %rip-relative + ] + -- in + return (Any rep code) + +#endif /* x86_64_TARGET_ARCH */ + +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- catch simple cases of zero- or sign-extended load getRegister (CmmMachOp (MO_U_Conv I8 I32) [CmmLoad addr _]) = do @@ -787,11 +835,86 @@ getRegister (CmmMachOp (MO_S_Conv I16 I32) [CmmLoad addr _]) = do code <- intLoadCode (MOVSxL I16) addr return (Any I32 code) +#endif + +#if x86_64_TARGET_ARCH + +-- catch simple cases of zero- or sign-extended load +getRegister (CmmMachOp (MO_U_Conv I8 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL I8) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I8 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I8) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_U_Conv I16 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL I16) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I16 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I16) addr + return (Any I64 code) + +getRegister (CmmMachOp (MO_U_Conv I32 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOV I32) addr -- 32-bit loads zero-extend + return (Any I64 code) + +getRegister (CmmMachOp (MO_S_Conv I32 I64) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL I32) addr + return (Any I64 code) + +#endif + +#if x86_64_TARGET_ARCH +getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do + lbl <- getNewLabelNat + let + code dst = toOL [ + -- This is how gcc does it, so it can't be that bad: + LDATA ReadOnlyData16 [ + CmmAlign 16, + CmmDataLabel lbl, + CmmStaticLit (CmmInt 0x80000000 I32), + CmmStaticLit (CmmInt 0 I32), + CmmStaticLit (CmmInt 0 I32), + CmmStaticLit (CmmInt 0 I32) + ], + XOR F32 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- xorps, so we need the 128-bit constant + -- ToDo: rip-relative + ] + -- + return (Any F32 code) + +getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do + lbl <- getNewLabelNat + let + -- This is how gcc does it, so it can't be that bad: + code dst = toOL [ + LDATA ReadOnlyData16 [ + CmmAlign 16, + CmmDataLabel lbl, + CmmStaticLit (CmmInt 0x8000000000000000 I64), + CmmStaticLit (CmmInt 0 I64) + ], + -- gcc puts an unpck here. Wonder if we need it. 
+ XOR F64 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst) + -- xorpd, so we need the 128-bit constant + -- ToDo: rip-relative + ] + -- + return (Any F64 code) +#endif + +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH getRegister (CmmMachOp mop [x]) -- unary MachOps = case mop of +#if i386_TARGET_ARCH MO_S_Neg F32 -> trivialUFCode F32 (GNEG F32) x MO_S_Neg F64 -> trivialUFCode F64 (GNEG F64) x +#endif MO_S_Neg rep -> trivialUCode rep (NEGI rep) x MO_Not rep -> trivialUCode rep (NOT rep) x @@ -805,6 +928,15 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps MO_S_Conv I16 I8 -> conversionNop I16 x MO_U_Conv I32 I16 -> conversionNop I32 x MO_S_Conv I32 I16 -> conversionNop I32 x +#if x86_64_TARGET_ARCH + MO_U_Conv I64 I32 -> conversionNop I64 x + MO_S_Conv I64 I32 -> conversionNop I64 x + MO_U_Conv I64 I16 -> conversionNop I64 x + MO_S_Conv I64 I16 -> conversionNop I64 x + MO_U_Conv I64 I8 -> conversionNop I64 x + MO_S_Conv I64 I8 -> conversionNop I64 x +#endif + MO_U_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x MO_S_Conv rep1 rep2 | rep1 == rep2 -> conversionNop rep1 x @@ -817,12 +949,32 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps MO_S_Conv I16 I32 -> integerExtend I16 I32 MOVSxL x MO_S_Conv I8 I16 -> integerExtend I8 I16 MOVSxL x +#if x86_64_TARGET_ARCH + MO_U_Conv I8 I64 -> integerExtend I8 I64 MOVZxL x + MO_U_Conv I16 I64 -> integerExtend I16 I64 MOVZxL x + MO_U_Conv I32 I64 -> integerExtend I32 I64 MOVZxL x + MO_S_Conv I8 I64 -> integerExtend I8 I64 MOVSxL x + MO_S_Conv I16 I64 -> integerExtend I16 I64 MOVSxL x + MO_S_Conv I32 I64 -> integerExtend I32 I64 MOVSxL x + -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl. + -- However, we don't want the register allocator to throw it + -- away as an unnecessary reg-to-reg move, so we keep it in + -- the form of a movzl and print it as a movl later. +#endif + +#if i386_TARGET_ARCH MO_S_Conv F32 F64 -> conversionNop F64 x MO_S_Conv F64 F32 -> conversionNop F32 x +#else + MO_S_Conv F32 F64 -> coerceFP2FP F64 x + MO_S_Conv F64 F32 -> coerceFP2FP F32 x +#endif + MO_S_Conv from to | isFloatingRep from -> coerceFP2Int from to x | isFloatingRep to -> coerceInt2FP from to x + other -> pprPanic "getRegister" (pprMachOp mop) where -- signed or unsigned extension. 
integerExtend from to instr expr = do @@ -869,14 +1021,27 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Lt rep -> condIntReg LU x y MO_U_Le rep -> condIntReg LEU x y - MO_Add F32 -> trivialFCode F32 GADD x y - MO_Sub F32 -> trivialFCode F32 GSUB x y +#if i386_TARGET_ARCH + MO_Add F32 -> trivialFCode F32 GADD x y + MO_Sub F32 -> trivialFCode F32 GSUB x y MO_Add F64 -> trivialFCode F64 GADD x y MO_Sub F64 -> trivialFCode F64 GSUB x y - MO_S_Quot F32 -> trivialFCode F32 GDIV x y + MO_S_Quot F32 -> trivialFCode F32 GDIV x y MO_S_Quot F64 -> trivialFCode F64 GDIV x y +#endif + +#if x86_64_TARGET_ARCH + MO_Add F32 -> trivialFCode F32 ADD x y + MO_Sub F32 -> trivialFCode F32 SUB x y + + MO_Add F64 -> trivialFCode F64 ADD x y + MO_Sub F64 -> trivialFCode F64 SUB x y + + MO_S_Quot F32 -> trivialFCode F32 FDIV x y + MO_S_Quot F64 -> trivialFCode F64 FDIV x y +#endif MO_Add rep -> add_code rep x y MO_Sub rep -> sub_code rep x y @@ -886,8 +1051,16 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Quot rep -> div_code rep False True x y MO_U_Rem rep -> div_code rep False False x y +#if i386_TARGET_ARCH MO_Mul F32 -> trivialFCode F32 GMUL x y MO_Mul F64 -> trivialFCode F64 GMUL x y +#endif + +#if x86_64_TARGET_ARCH + MO_Mul F32 -> trivialFCode F32 MUL x y + MO_Mul F64 -> trivialFCode F64 MUL x y +#endif + MO_Mul rep -> let op = IMUL rep in trivialCode rep op (Just op) x y @@ -912,24 +1085,24 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps where -------------------- imulMayOflo :: MachRep -> CmmExpr -> CmmExpr -> NatM Register - imulMayOflo I32 a b = do - res_lo <- getNewRegNat I32 - res_hi <- getNewRegNat I32 + imulMayOflo rep a b = do + res_lo <- getNewRegNat rep + res_hi <- getNewRegNat rep (a_reg, a_code) <- getNonClobberedReg a (b_reg, b_code) <- getSomeReg b let code dst = a_code `appOL` b_code `appOL` toOL [ - MOV I32 (OpReg a_reg) (OpReg res_hi), - MOV I32 (OpReg b_reg) (OpReg res_lo), + MOV rep (OpReg a_reg) (OpReg res_hi), + MOV rep (OpReg b_reg) (OpReg res_lo), IMUL64 res_hi res_lo, -- result in res_hi:res_lo - SAR I32 (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part - SUB I32 (OpReg res_hi) (OpReg res_lo), -- compare against upper - MOV I32 (OpReg res_lo) (OpReg dst) + SAR rep (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part + SUB rep (OpReg res_hi) (OpReg res_lo), -- compare against upper + MOV rep (OpReg res_lo) (OpReg dst) -- dst==0 if high part == sign extended low part ] -- in - return (Any I32 code) + return (Any rep code) -------------------- shift_code :: MachRep @@ -987,7 +1160,7 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps (y_op, y_code) <- getOperand y -- cannot be clobbered x_code <- getAnyReg x let - widen | signed = CLTD + widen | signed = CLTD rep | otherwise = XOR rep (OpReg edx) (OpReg edx) instr | signed = IDIV @@ -1004,17 +1177,18 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps return (Fixed rep result code) - getRegister (CmmLoad mem pk) | isFloatingRep pk = do Amode src mem_code <- getAmode mem let code dst = mem_code `snocOL` - GLD pk src dst + IF_ARCH_i386(GLD pk src dst, + MOV pk (OpAddr src) (OpReg dst)) -- return (Any pk code) +#if i386_TARGET_ARCH getRegister (CmmLoad mem pk) | pk /= I64 = do @@ -1029,6 +1203,15 @@ getRegister (CmmLoad mem pk) -- we can't guarantee access to an 8-bit variant of every register -- (esi and edi don't have 8-bit variants), so to make things -- simpler we do our 8-bit arithmetic with full 32-bit registers. 
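[Editorial aside] On imulMayOflo above: the IMUL64/SAR/SUB sequence implements the rule that a signed N-by-N-bit multiply overflows exactly when the high word of the full 2N-bit product differs from the sign-extension of the low word. A standalone sketch of that test at a fixed 32-bit width (nothing below is GHC code; note the patch keeps the SAR count at 31 even for the generalised rep, where the I64 analogue would be 63):

import Data.Bits (shiftR)
import Data.Int  (Int32, Int64)

mulMayOflo32 :: Int32 -> Int32 -> Bool
mulMayOflo32 a b = hi /= lo `shiftR` 31
  where
    p  = fromIntegral a * fromIntegral b :: Int64        -- the full 64-bit product
    lo = fromIntegral (fromIntegral p :: Int32) :: Int64 -- low word, sign-extended
    hi = p `shiftR` 32                                   -- high word (arithmetic shift)

For example, mulMayOflo32 65536 65536 is True (the product needs 33 bits), while mulMayOflo32 (-1) 1 is False.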
+#endif + +#if x86_64_TARGET_ARCH +-- Simpler memory load code on x86_64 +getRegister (CmmLoad mem pk) + = do + code <- intLoadCode (MOV pk) mem + return (Any pk code) +#endif getRegister (CmmLit (CmmInt 0 rep)) = let @@ -1065,22 +1248,13 @@ anyReg :: Register -> NatM (Reg -> InstrBlock) anyReg (Any _ code) = return code anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst) --- The dual to getAnyReg: compute an expression into a register, but --- we don't mind which one it is. -getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) -getSomeReg expr = do - r <- getRegister expr - case r of - Any rep code -> do - tmp <- getNewRegNat rep - return (tmp, code tmp) - Fixed _ reg code -> - return (reg, code) - -- A bit like getSomeReg, but we want a reg that can be byte-addressed. -- Fixed registers might not be byte-addressable, so we make sure we've -- got a temporary, inserting an extra reg copy if necessary. getByteReg :: CmmExpr -> NatM (Reg, InstrBlock) +#if x86_64_TARGET_ARCH +getByteReg = getSomeReg -- all regs are byte-addressable on x86_64 +#else getByteReg expr = do r <- getRegister expr case r of @@ -1094,6 +1268,7 @@ getByteReg expr = do return (tmp, code `snocOL` reg2reg rep reg tmp) -- ToDo: could optimise slightly by checking for byte-addressable -- real registers, but that will happen very rarely if at all. +#endif -- Another variant: this time we want the result in a register that cannot -- be modified by code to evaluate an arbitrary expression. @@ -1114,10 +1289,12 @@ getNonClobberedReg expr = do reg2reg :: MachRep -> Reg -> Reg -> Instr reg2reg rep src dst +#if i386_TARGET_ARCH | isFloatingRep rep = GMOV src dst +#endif | otherwise = MOV rep (OpReg src) (OpReg dst) -#endif /* i386_TARGET_ARCH */ +#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1553,19 +1730,6 @@ extendSExpr rep x = CmmMachOp (MO_S_Conv rep I32) [x] extendUExpr I32 x = x extendUExpr rep x = CmmMachOp (MO_U_Conv rep I32) [x] --- ###FIXME: exact code duplication from x86 case --- The dual to getAnyReg: compute an expression into a register, but --- we don't mind which one it is. -getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) -getSomeReg expr = do - r <- getRegister expr - case r of - Any rep code -> do - tmp <- getNewRegNat rep - return (tmp, code tmp) - Fixed _ reg code -> - return (reg, code) - #endif /* powerpc_TARGET_ARCH */ @@ -1639,17 +1803,19 @@ getAmode other -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- This is all just ridiculous, since it carefully undoes -- what mangleIndexTree has just done. -getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit (CmmInt i _)]) +getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)]) + | not (is64BitLit lit) -- ASSERT(rep == I32)??? = do (x_reg, x_code) <- getSomeReg x let off = ImmInt (-(fromInteger i)) return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code) -getAmode (CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt i _)]) +getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)]) + | not (is64BitLit lit) -- ASSERT(rep == I32)??? 
= do (x_reg, x_code) <- getSomeReg x let off = ImmInt (fromInteger i) @@ -1674,14 +1840,14 @@ getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _) return (Amode (AddrBaseIndex (Just x_reg) (Just (y_reg,base)) (ImmInt 0)) code) -getAmode (CmmLit lit) +getAmode (CmmLit lit) | not (is64BitLit lit) = return (Amode (ImmAddr (litToImm lit) 0) nilOL) getAmode expr = do (reg,code) <- getSomeReg expr return (Amode (AddrBaseIndex (Just reg) Nothing (ImmInt 0)) code) -#endif /* i386_TARGET_ARCH */ +#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1795,17 +1961,19 @@ getAmode other -- ----------------------------------------------------------------------------- -- getOperand: sometimes any operand will do. --- getOperand gets a *safe* operand; that is, the value of the operand --- will remain valid across the computation of an arbitrary expression, --- unless the expression is computed directly into a register which --- the operand refers to (see trivialCode where this function is used --- for an example). +-- getNonClobberedOperand: the value of the operand will remain valid across +-- the computation of an arbitrary expression, unless the expression +-- is computed directly into a register which the operand refers to +-- (see trivialCode where this function is used for an example). -#ifdef i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -getOperand :: CmmExpr -> NatM (Operand, InstrBlock) -getOperand (CmmLoad mem pk) - | not (isFloatingRep pk) && pk /= I64 = do +getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand (CmmLit lit) + | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = + return (OpImm (litToImm lit), nilOL) +getNonClobberedOperand (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do Amode src mem_code <- getAmode mem (src',save_code) <- if (amodeCouldBeClobbered src) @@ -1816,8 +1984,7 @@ getOperand (CmmLoad mem pk) else return (src, nilOL) return (OpAddr src', save_code `appOL` mem_code) - -getOperand e = do +getNonClobberedOperand e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) @@ -1827,6 +1994,39 @@ amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode) regClobbered (RealReg rr) = isFastTrue (freeReg rr) regClobbered _ = False +-- getOperand: the operand is not required to remain valid across the +-- computation of an arbitrary expression. 
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock) +getOperand (CmmLit lit) + | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = + return (OpImm (litToImm lit), nilOL) +getOperand (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) +getOperand e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +isOperand :: CmmExpr -> Bool +isOperand (CmmLoad _ _) = True +isOperand (CmmLit lit) = not (is64BitLit lit) && + not (isFloatingRep (cmmLitRep lit)) +isOperand _ = False + +getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock) +getRegOrMem (CmmLoad mem pk) + | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) +getRegOrMem e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +#if x86_64_TARGET_ARCH +is64BitLit (CmmInt i I64) = i > 0x7fffffff || i < -0x80000000 +#endif +is64BitLit x = False #endif -- ----------------------------------------------------------------------------- @@ -1846,7 +2046,7 @@ getCondCode = panic "MachCode.getCondCode: not on Alphas" -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH || sparc_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH || sparc_TARGET_ARCH -- yes, they really do seem to want exactly the same! getCondCode (CmmMachOp mop [x, y]) @@ -1940,10 +2140,10 @@ condFltCode = panic "MachCode.condFltCode: not on Alphas" #endif /* alpha_TARGET_ARCH */ -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- memory vs immediate -condIntCode cond (CmmLoad x pk) (CmmLit lit) = do +condIntCode cond (CmmLoad x pk) (CmmLit lit) | not (is64BitLit lit) = do Amode x_addr x_code <- getAmode x let imm = litToImm lit @@ -1961,50 +2161,29 @@ condIntCode cond x (CmmLit (CmmInt 0 pk)) = do -- return (CondCode False cond code) --- anything vs immediate -condIntCode cond x (CmmLit lit) = do - (x_reg, x_code) <- getSomeReg x +-- anything vs operand +condIntCode cond x y | isOperand y = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y let - imm = litToImm lit - code = x_code `snocOL` - CMP (cmmLitRep lit) (OpImm imm) (OpReg x_reg) - -- in - return (CondCode False cond code) - --- memory vs anything -condIntCode cond (CmmLoad x pk) y = do - (y_reg, y_code) <- getNonClobberedReg y - Amode x_addr x_code <- getAmode x - let - code = y_code `appOL` - x_code `snocOL` - CMP pk (OpReg y_reg) (OpAddr x_addr) - -- in - return (CondCode False cond code) - --- anything vs memory -condIntCode cond y (CmmLoad x pk) = do - (y_reg, y_code) <- getNonClobberedReg y - Amode x_addr x_code <- getAmode x - let - code = y_code `appOL` - x_code `snocOL` - CMP pk (OpAddr x_addr) (OpReg y_reg) + code = x_code `appOL` y_code `snocOL` + CMP (cmmExprRep x) y_op (OpReg x_reg) -- in return (CondCode False cond code) -- anything vs anything condIntCode cond x y = do - (x_op, x_code) <- getOperand x - (y_reg, y_code) <- getSomeReg y + (y_reg, y_code) <- getNonClobberedReg y + (x_op, x_code) <- getRegOrMem x let - code = x_code `appOL` - y_code `snocOL` + code = y_code `appOL` + x_code `snocOL` CMP (cmmExprRep x) (OpReg y_reg) x_op -- in return (CondCode False cond code) +#endif ------------ +#if i386_TARGET_ARCH condFltCode cond x y = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do (x_reg, x_code) <- getNonClobberedReg x @@ -2015,9 
+2194,25 @@ condFltCode cond x y -- The GCMP insn does the test and sets the zero flag if comparable -- and true. Hence we always supply EQQ as the condition to test. return (CondCode True EQQ code) - #endif /* i386_TARGET_ARCH */ +#if x86_64_TARGET_ARCH +-- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be +-- an operand, but the right must be a reg. We can probably do better +-- than this general case... +condFltCode cond x y = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` + y_code `snocOL` + CMP (cmmExprRep x) y_op (OpReg x_reg) + -- in + return (CondCode False (condToUnsigned cond) code) + -- we need to use the unsigned comparison operators on the + -- result of this comparison. +#endif + -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if sparc_TARGET_ARCH @@ -2171,7 +2366,7 @@ assignIntCode pk dst src -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- integer assignment to memory assignMem_IntCode pk addr src = do @@ -2189,7 +2384,7 @@ assignMem_IntCode pk addr src = do return code where get_op_RI :: CmmExpr -> NatM (InstrBlock,Operand) -- code, operator - get_op_RI (CmmLit lit) + get_op_RI (CmmLit lit) | not (is64BitLit lit) = return (nilOL, OpImm (litToImm lit)) get_op_RI op = do (reg,code) <- getNonClobberedReg op @@ -2298,7 +2493,7 @@ assignFltCode pk dst src -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- Floating point assignment to memory assignMem_FltCode pk addr src = do @@ -2307,7 +2502,8 @@ assignMem_FltCode pk addr src = do let code = src_code `appOL` addr_code `snocOL` - GST pk src_reg addr + IF_ARCH_i386(GST pk src_reg addr, + MOV pk (OpReg src_reg) (OpAddr addr)) return code -- Floating point assignment to a register/temporary @@ -2416,7 +2612,7 @@ genJump tree -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genJump (CmmLoad mem pk) = do Amode target code <- getAmode mem @@ -2471,7 +2667,7 @@ genBranch :: BlockId -> NatM InstrBlock genBranch id = return (unitOL (BR id)) #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genBranch id = return (unitOL (JXX ALWAYS id)) #endif @@ -2661,7 +2857,7 @@ genCondJump lbl (StPrim op [x, y]) -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genCondJump id bool = do CondCode _ cond code <- getCondCode bool @@ -2835,7 +3031,7 @@ genCCall target dest_regs args vols = do toOL ( -- Deallocate parameters after call for ccall; -- but not for stdcall (callee does it) - (if cconv == StdCallConv then [] else + (if cconv == StdCallConv || tot_arg_size==0 then [] else [ADD I32 (OpImm (ImmInt tot_arg_size)) (OpReg esp)]) ++ [DELTA (delta + tot_arg_size)] @@ -2966,6 +3162,159 @@ outOfLineFloatOp mop res args vols -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +#if x86_64_TARGET_ARCH + +genCCall (CmmPrim op) [(r,_)] args vols = + panic "genCCall(CmmPrim)(x86_64)" + +genCCall target dest_regs args vols = do + + -- load up the register arguments + (stack_args, sse_regs, load_args_code) + <- load_args args allArgRegs allFPArgRegs 0 nilOL + + let + tot_arg_size = arg_size * length stack_args + + -- On entry to the called function, %rsp should be aligned + -- on a 16-byte boundary +8 (i.e. 
the first stack arg after + -- the return address is 16-byte aligned). In STG land + -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just + -- need to make sure we push a multiple of 16-bytes of args, + -- plus the return address, to get the correct alignment. + -- Urg, this is hard. We need to feed the delta back into + -- the arg pushing code. + (real_size, adjust_rsp) <- + if tot_arg_size `rem` 16 == 0 + then return (tot_arg_size, nilOL) + else do -- we need to adjust... + delta <- getDeltaNat + setDeltaNat (delta-8) + return (tot_arg_size+8, toOL [ + SUB I64 (OpImm (ImmInt 8)) (OpReg rsp), + DELTA (delta-8) + ]) + + -- push the stack args, right to left + push_code <- push_args (reverse stack_args) nilOL + delta <- getDeltaNat + + -- deal with static vs dynamic call targets + (callinsns,cconv) <- + case target of + -- CmmPrim -> ... + CmmForeignCall (CmmLit (CmmLabel lbl)) conv + -> -- ToDo: stdcall arg sizes + return (unitOL (CALL (Left fn_imm)), conv) + where fn_imm = ImmCLbl lbl + CmmForeignCall expr conv + -> do (dyn_r, dyn_c) <- getSomeReg expr + return (dyn_c `snocOL` CALL (Right dyn_r), conv) + + let + -- The x86_64 ABI requires us to set %al to the number of SSE + -- registers that contain arguments, if the called routine + -- is a varargs function. We don't know whether it's a + -- varargs function or not, so we have to assume it is. + -- + -- It's not safe to omit this assignment, even if the number + -- of SSE regs in use is zero. If %al is larger than 8 + -- on entry to a varargs function, seg faults ensue. + assign_eax n = unitOL (MOV I32 (OpImm (ImmInt n)) (OpReg eax)) + + let call = callinsns `appOL` + toOL ( + -- Deallocate parameters after call for ccall; + -- but not for stdcall (callee does it) + (if cconv == StdCallConv || real_size==0 then [] else + [ADD wordRep (OpImm (ImmInt real_size)) (OpReg esp)]) + ++ + [DELTA (delta + real_size)] + ) + -- in + setDeltaNat (delta + real_size) + + let + -- assign the results, if necessary + assign_code [] = nilOL + assign_code [(dest,_hint)] = + case rep of + F32 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest)) + F64 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest)) + rep -> unitOL (MOV rep (OpReg rax) (OpReg r_dest)) + where + rep = cmmRegRep dest + r_dest = getRegisterReg dest + assign_code many = panic "genCCall.assign_code many" + + return (load_args_code `appOL` + adjust_rsp `appOL` + push_code `appOL` + assign_eax sse_regs `appOL` + call `appOL` + assign_code dest_regs) + + where + arg_size = 8 -- always, at the mo + + load_args :: [(CmmExpr,MachHint)] + -> [Reg] -- int regs avail for args + -> [Reg] -- FP regs avail for args + -> Int -> InstrBlock + -> NatM ([(CmmExpr,MachHint)],Int,InstrBlock) + load_args args [] [] sse_regs code = return (args, sse_regs, code) + -- no more regs to use + load_args [] aregs fregs sse_regs code = return ([],sse_regs,code) + -- no more args to push + load_args ((arg,hint) : rest) aregs fregs sse_regs code + | isFloatingRep arg_rep = + case fregs of + [] -> push_this_arg + (r:rs) -> do + arg_code <- getAnyReg arg + load_args rest aregs rs (sse_regs+1) (code `appOL` arg_code r) + | otherwise = + case aregs of + [] -> push_this_arg + (r:rs) -> do + arg_code <- getAnyReg arg + load_args rest rs fregs sse_regs (code `appOL` arg_code r) + where + arg_rep = cmmExprRep arg + + push_this_arg = do + (args',sse',code') <- load_args rest aregs fregs sse_regs code + return ((arg,hint):args', sse', code') + + push_args [] code = return code + push_args ((arg,hint):rest) code + | isFloatingRep 
arg_rep = do + (arg_reg, arg_code) <- getSomeReg arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` toOL [ + MOV arg_rep (OpReg arg_reg) (OpAddr (spRel 0)), + SUB wordRep (OpImm (ImmInt arg_size)) (OpReg rsp) , + DELTA (delta-arg_size)] + push_args rest code' + + | otherwise = do + -- we only ever generate word-sized function arguments. Promotion + -- has already happened: our Int8# type is kept sign-extended + -- in an Int#, for example. + ASSERT(arg_rep == I64) return () + (arg_op, arg_code) <- getOperand arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` toOL [PUSH I64 arg_op, + DELTA (delta-arg_size)] + push_args rest code' + where + arg_rep = cmmExprRep arg +#endif + +-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #if sparc_TARGET_ARCH {- The SPARC calling convention is an absolute @@ -3365,13 +3714,13 @@ genCCall target dest_regs argsAndHints vols genSwitch :: CmmExpr -> [Maybe BlockId] -> NatM InstrBlock -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH genSwitch expr ids = do (reg,e_code) <- getSomeReg expr lbl <- getNewLabelNat let jumpTable = map jumpTableEntry ids - op = OpAddr (AddrBaseIndex Nothing (Just (reg,4)) (ImmCLbl lbl)) + op = OpAddr (AddrBaseIndex Nothing (Just (reg,wORD_SIZE)) (ImmCLbl lbl)) code = e_code `appOL` toOL [ LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable), JMP_TBL op [ id | Just id <- ids ] @@ -3455,7 +3804,7 @@ condFltReg = panic "MachCode.condFltReg (not on Alpha)" -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH condIntReg cond x y = do CondCode _ cond cond_code <- condIntCode cond x y @@ -3658,9 +4007,11 @@ trivialCode -> IF_ARCH_alpha((Reg -> RI -> Reg -> Instr) ,IF_ARCH_i386 ((Operand -> Operand -> Instr) -> Maybe (Operand -> Operand -> Instr) + ,IF_ARCH_x86_64 ((Operand -> Operand -> Instr) + -> Maybe (Operand -> Operand -> Instr) ,IF_ARCH_sparc((Reg -> RI -> Reg -> Instr) ,IF_ARCH_powerpc(Bool -> (Reg -> Reg -> RI -> Instr) - ,)))) + ,))))) -> CmmExpr -> CmmExpr -- the two arguments -> NatM Register @@ -3670,7 +4021,8 @@ trivialFCode -> IF_ARCH_alpha((Reg -> Reg -> Reg -> Instr) ,IF_ARCH_sparc((Size -> Reg -> Reg -> Reg -> Instr) ,IF_ARCH_i386 ((MachRep -> Reg -> Reg -> Reg -> Instr) - ,))) + ,IF_ARCH_x86_64 ((MachRep -> Operand -> Operand -> Instr) + ,)))) -> CmmExpr -> CmmExpr -- the two arguments -> NatM Register #endif @@ -3679,9 +4031,10 @@ trivialUCode :: MachRep -> IF_ARCH_alpha((RI -> Reg -> Instr) ,IF_ARCH_i386 ((Operand -> Instr) + ,IF_ARCH_x86_64 ((Operand -> Instr) ,IF_ARCH_sparc((RI -> Reg -> Instr) ,IF_ARCH_powerpc((Reg -> Reg -> Instr) - ,)))) + ,))))) -> CmmExpr -- the one argument -> NatM Register @@ -3690,8 +4043,9 @@ trivialUFCode :: MachRep -> IF_ARCH_alpha((Reg -> Reg -> Instr) ,IF_ARCH_i386 ((Reg -> Reg -> Instr) + ,IF_ARCH_x86_64 ((Reg -> Reg -> Instr) ,IF_ARCH_sparc((Reg -> Reg -> Instr) - ,))) + ,)))) -> CmmExpr -- the one argument -> NatM Register #endif @@ -3770,7 +4124,7 @@ trivialUFCode _ instr x -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH {- The Rules of the Game are: @@ -3817,16 +4171,8 @@ SDM's version of The Rules: register happens to be the destination register. 
-}

-trivialCode rep instr maybe_revinstr a (CmmLit lit_b) = do
-  a_code <- getAnyReg a
-  let
-       code dst
-         = a_code dst `snocOL`
-           instr (OpImm (litToImm lit_b)) (OpReg dst)
-  -- in
-  return (Any rep code)
-
-trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
+trivialCode rep instr (Just revinstr) (CmmLit lit_a) b
+  | not (is64BitLit lit_a) = do
   b_code <- getAnyReg b
   let
        code dst
@@ -3836,7 +4182,7 @@ trivialCode rep instr (Just revinstr) (CmmLit lit_a) b = do
   return (Any rep code)

 trivialCode rep instr maybe_revinstr a b = do
-  (b_op, b_code) <- getOperand b
+  (b_op, b_code) <- getNonClobberedOperand b
   a_code <- getAnyReg a
   tmp <- getNewRegNat rep
   let
@@ -3861,6 +4207,7 @@ trivialCode rep instr maybe_revinstr a b = do
   where
     reg `clashesWith` OpReg reg2   = reg == reg2
     reg `clashesWith` OpAddr amode = any (==reg) (addrModeRegs amode)
+    reg `clashesWith` _            = False

 -----------

@@ -3875,6 +4222,8 @@ trivialUCode rep instr x = do

 -----------

+#if i386_TARGET_ARCH
+
 trivialFCode pk instr x y = do
   (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
   (y_reg, y_code) <- getSomeReg y
@@ -3886,6 +4235,26 @@ trivialFCode pk instr x y = do
   -- in
   return (Any pk code)

+#endif
+
+#if x86_64_TARGET_ARCH
+
+-- We use the 2-operand SSE2 floating pt instructions.  ToDo: improve on
+-- this by using some of the special cases in trivialCode above.
+trivialFCode pk instr x y = do
+  (y_reg, y_code) <- getNonClobberedReg y -- these work for float regs too
+  x_code <- getAnyReg x
+  let
+     code dst =
+        y_code `appOL`
+        x_code dst `snocOL`
+        instr pk (IF_ARCH_x86_64(OpReg,) y_reg)
+                 (IF_ARCH_x86_64(OpReg,) dst)
+  -- in
+  return (Any pk code)
+
+#endif
+
 -------------

 trivialUFCode rep instr x = do
@@ -4076,7 +4445,7 @@ remainderCode rep div x y = do

 coerceInt2FP :: MachRep -> MachRep -> CmmExpr -> NatM Register
 coerceFP2Int :: MachRep -> MachRep -> CmmExpr -> NatM Register

-#ifdef sparc_TARGET_ARCH
+#if sparc_TARGET_ARCH
 coerceDbl2Flt :: CmmExpr -> NatM Register
 coerceFlt2Dbl :: CmmExpr -> NatM Register
 #endif
@@ -4144,6 +4513,37 @@ coerceFP2Int from to x = do

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

+#if x86_64_TARGET_ARCH
+
+coerceFP2Int from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceInt2FP from to x = do
+  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+  let
+        opc  = case to of F32 -> CVTSI2SS; F64 -> CVTSI2SD
+        code dst = x_code `snocOL` opc x_op dst
+  -- in
+  return (Any to code) -- works even if the destination rep is <I32
+
+coerceFP2FP :: MachRep -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+  (x_reg, x_code) <- getSomeReg x
+  let
+        opc  = case to of F32 -> CVTSD2SS; F64 -> CVTSS2SD
+        code dst = x_code `snocOL` opc x_reg dst
+  -- in
+  return (Any to code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 #if sparc_TARGET_ARCH

 coerceInt2FP pk x
diff --git a/ghc/compiler/nativeGen/MachInstrs.hs b/ghc/compiler/nativeGen/MachInstrs.hs
index 1b662e3..0839694 100644
--- a/ghc/compiler/nativeGen/MachInstrs.hs
+++ b/ghc/compiler/nativeGen/MachInstrs.hs
@@ -14,24 +14,22 @@ module MachInstrs (
 	-- * Machine instructions
 	Instr(..),
-	Cond(..),
-#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
+	Cond(..), condUnsigned, condToSigned, condToUnsigned,
+
+#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
 	Size(..), machRepSize,
 #endif
 	RI(..),
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	Operand(..),
+#endif
+#if i386_TARGET_ARCH
i386_insert_ffrees, #endif #if sparc_TARGET_ARCH riZero, fpRelEA, moveSp, fPair, #endif -#if powerpc_TARGET_ARCH - condUnsigned, condToSigned, -#endif - DestInfo(..), hasDestInfo, pprDests, - ) where #include "HsVersions.h" @@ -42,7 +40,6 @@ import MachOp ( MachRep(..) ) import CLabel ( CLabel, pprCLabel ) import Panic ( panic ) import Outputable -import Config ( cLeadingUnderscore ) import FastString import GLAEXTS @@ -72,7 +69,7 @@ data Cond | ULE -- For CMP only | ULT -- For CMP only #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH = ALWAYS -- What's really used? ToDo | EQQ | GE @@ -122,6 +119,23 @@ data Cond #endif deriving Eq -- to make an assertion work +condUnsigned GU = True +condUnsigned LU = True +condUnsigned GEU = True +condUnsigned LEU = True +condUnsigned _ = False + +condToSigned GU = GTT +condToSigned LU = LTT +condToSigned GEU = GE +condToSigned LEU = LE +condToSigned x = x + +condToUnsigned GTT = GU +condToUnsigned LTT = LU +condToUnsigned GE = GEU +condToUnsigned LE = LEU +condToUnsigned x = x -- ----------------------------------------------------------------------------- -- Sizes on this architecture @@ -129,7 +143,7 @@ data Cond -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes -- here. I've removed them from the x86 version, we'll see what happens --SDM -#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH +#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH data Size #if alpha_TARGET_ARCH = B -- byte @@ -363,7 +377,7 @@ bit or 64 bit precision. --SDM 1/2003 -} -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH -- data Instr continues... @@ -371,6 +385,9 @@ bit or 64 bit precision. | MOV MachRep Operand Operand | MOVZxL MachRep Operand Operand -- size is the size of operand 1 | MOVSxL MachRep Operand Operand -- size is the size of operand 1 + -- x86_64 note: plain mov into a 32-bit register always zero-extends + -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which + -- don't affect the high bits of the register. -- Load effective address (also a very useful three-operand add instruction :-) | LEA MachRep Operand Operand @@ -379,9 +396,9 @@ bit or 64 bit precision. | ADD MachRep Operand Operand | ADC MachRep Operand Operand | SUB MachRep Operand Operand - | IMUL MachRep Operand Operand -- signed int mul - | MUL MachRep Operand Operand -- unsigned int mul + | MUL MachRep Operand Operand + | IMUL MachRep Operand Operand -- signed int mul | IMUL64 Reg Reg -- operand1:operand2 := (operand1[31:0] *signed operand2[31:0]) @@ -403,6 +420,7 @@ bit or 64 bit precision. | BT MachRep Imm Operand | NOP +#if i386_TARGET_ARCH -- Float Arithmetic. -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles @@ -442,6 +460,32 @@ bit or 64 bit precision. | GTAN MachRep Reg Reg -- src, dst | GFREE -- do ffree on all x86 regs; an ugly hack +#endif + +#if x86_64_TARGET_ARCH +-- SSE2 floating point: we use a restricted set of the available SSE2 +-- instructions for floating-point. + + -- use MOV for moving (either movss or movsd (movlpd better?)) + + | CVTSS2SD Reg Reg -- F32 to F64 + | CVTSD2SS Reg Reg -- F64 to F32 + | CVTSS2SI Operand Reg -- F32 to I32/I64 (with rounding) + | CVTSD2SI Operand Reg -- F64 to I32/I64 (with rounding) + | CVTSI2SS Operand Reg -- I32/I64 to F32 + | CVTSI2SD Operand Reg -- I32/I64 to F64 + + -- use ADD & SUB for arithmetic. In both cases, operands + -- are Operand Reg. 
+ + -- SSE2 floating-point division: + | FDIV MachRep Operand Operand -- divisor, dividend(dst) + + -- use CMP for comparisons. ucomiss and ucomisd instructions + -- compare single/double prec floating point respectively. + + | SQRT MachRep Operand Reg -- src, dst +#endif -- Comparison | TEST MachRep Operand Operand @@ -462,7 +506,7 @@ bit or 64 bit precision. | CALL (Either Imm Reg) -- Other things. - | CLTD -- sign extend %eax into %edx:%eax + | CLTD MachRep -- sign extend %eax into %edx:%eax | FETCHGOT Reg -- pseudo-insn for position-independent code -- pretty-prints as @@ -475,7 +519,9 @@ data Operand | OpImm Imm -- immediate value | OpAddr AddrMode -- memory reference +#endif /* i386 or x86_64 */ +#if i386_TARGET_ARCH i386_insert_ffrees :: [Instr] -> [Instr] i386_insert_ffrees insns | any is_G_instr insns @@ -506,7 +552,6 @@ is_G_instr instr GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True GFREE -> panic "is_G_instr: GFREE (!)" other -> False - #endif /* i386_TARGET_ARCH */ @@ -670,33 +715,4 @@ fPair other = pprPanic "fPair(sparc NCG)" (ppr other) | FETCHPC Reg -- pseudo-instruction: -- bcl to next insn, mflr reg -condUnsigned GU = True -condUnsigned LU = True -condUnsigned GEU = True -condUnsigned LEU = True -condUnsigned _ = False - -condToSigned GU = GTT -condToSigned LU = LTT -condToSigned GEU = GE -condToSigned LEU = LE -condToSigned x = x #endif /* powerpc_TARGET_ARCH */ - - --- ----------------------------------------------------------------------------- --- DestInfo - --- ToDo: might not be needed anymore --SDM - --- used by insnFuture in RegAllocInfo.lhs -data DestInfo - = NoDestInfo -- no supplied dests; infer from context - | DestInfo [CLabel] -- precisely these dests and no others - -hasDestInfo NoDestInfo = False -hasDestInfo (DestInfo _) = True - -pprDests :: DestInfo -> SDoc -pprDests NoDestInfo = text "NoDestInfo" -pprDests (DestInfo dsts) = brackets (hsep (map pprCLabel dsts)) diff --git a/ghc/compiler/nativeGen/MachRegs.lhs b/ghc/compiler/nativeGen/MachRegs.lhs index a3946a7..44448f6 100644 --- a/ghc/compiler/nativeGen/MachRegs.lhs +++ b/ghc/compiler/nativeGen/MachRegs.lhs @@ -51,6 +51,15 @@ module MachRegs ( fake0, fake1, fake2, fake3, fake4, fake5, addrModeRegs, #endif +#if x86_64_TARGET_ARCH + rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp, + eax, ebx, ecx, edx, esi, edi, ebp, esp, + r8, r9, r10, r11, r12, r13, r14, r15, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + xmm, eax, edx, + addrModeRegs, allFPArgRegs, +#endif #if sparc_TARGET_ARCH fits13Bits, fpRel, gReg, iReg, lReg, oReg, largeOffsetError, @@ -141,7 +150,7 @@ data AddrMode | AddrRegImm Reg Imm #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH = AddrBaseIndex Base Index Displacement | ImmAddr Imm Int @@ -160,7 +169,7 @@ type Displacement = Imm | AddrRegImm Reg Imm #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH addrModeRegs :: AddrMode -> [Reg] addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs where @@ -177,7 +186,7 @@ addrOffset addr off #if alpha_TARGET_ARCH _ -> panic "MachMisc.addrOffset not defined for Alpha" #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH ImmAddr i off0 -> Just (ImmAddr i (off0 + off)) AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off))) @@ -280,8 +289,10 @@ spRel :: Int -- desired stack offset in words, positive or negative -> AddrMode spRel n -#if i386_TARGET_ARCH +#if defined(i386_TARGET_ARCH) = AddrBaseIndex (Just 
esp) Nothing (ImmInt (n * wORD_SIZE))
+#elif defined(x86_64_TARGET_ARCH)
+      = AddrBaseIndex (Just rsp) Nothing (ImmInt (n * wORD_SIZE))
 #else
       = AddrRegImm sp (ImmInt (n * wORD_SIZE))
 #endif
@@ -497,6 +508,88 @@ showReg n
 #endif

 {-
+AMD x86_64 architecture:
+- All 16 integer registers have 32-bit counterparts (eax, r8d etc.)
+- All 16 integer registers have 16-bit counterparts (ax, r8w etc.)
+- All 16 integer registers have 8-bit counterparts (al, sil, r8b etc.)
+
+-}

+#if x86_64_TARGET_ARCH

+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+  r8, r9, r10, r11, r12, r13, r14, r15,
+  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+  xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg

+rax   = RealReg 0
+rbx   = RealReg 1
+rcx   = RealReg 2
+rdx   = RealReg 3
+rsi   = RealReg 4
+rdi   = RealReg 5
+rbp   = RealReg 6
+rsp   = RealReg 7
+r8    = RealReg 8
+r9    = RealReg 9
+r10   = RealReg 10
+r11   = RealReg 11
+r12   = RealReg 12
+r13   = RealReg 13
+r14   = RealReg 14
+r15   = RealReg 15
+xmm0  = RealReg 16
+xmm1  = RealReg 17
+xmm2  = RealReg 18
+xmm3  = RealReg 19
+xmm4  = RealReg 20
+xmm5  = RealReg 21
+xmm6  = RealReg 22
+xmm7  = RealReg 23
+xmm8  = RealReg 24
+xmm9  = RealReg 25
+xmm10 = RealReg 26
+xmm11 = RealReg 27
+xmm12 = RealReg 28
+xmm13 = RealReg 29
+xmm14 = RealReg 30
+xmm15 = RealReg 31

+ -- so we can re-use some x86 code:
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp

+xmm n = RealReg (16+n)

+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+regClass (RealReg i)      = if i < 16 then RcInteger else RcDouble
+regClass (VirtualRegI  u) = RcInteger
+regClass (VirtualRegHi u) = RcInteger
+regClass (VirtualRegD  u) = RcDouble
+regClass (VirtualRegF  u) = pprPanic "regClass(x86_64):VirtualRegF"
+                                     (ppr (VirtualRegF u))

+regNames
+ = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ]

+showReg :: RegNo -> String
+showReg n
+  | n >= 16   = "%xmm" ++ show (n-16)
+  | n >= 8    = "%r" ++ show n
+  | otherwise = regNames !! n

+#endif

+{-
 The SPARC has 64 registers of interest; 32 integer registers and 32
 floating point registers.  The mapping of STG registers to SPARC
 machine registers is defined in StgRegs.h.  We are, of course,
@@ -647,6 +740,42 @@ names in the header files.  Gag me with a spoon, eh?
 #define fake4 12
 #define fake5 13
 #endif
+
+#if x86_64_TARGET_ARCH
+#define rax   0
+#define rbx   1
+#define rcx   2
+#define rdx   3
+#define rsi   4
+#define rdi   5
+#define rbp   6
+#define rsp   7
+#define r8    8
+#define r9    9
+#define r10   10
+#define r11   11
+#define r12   12
+#define r13   13
+#define r14   14
+#define r15   15
+#define xmm0  16
+#define xmm1  17
+#define xmm2  18
+#define xmm3  19
+#define xmm4  20
+#define xmm5  21
+#define xmm6  22
+#define xmm7  23
+#define xmm8  24
+#define xmm9  25
+#define xmm10 26
+#define xmm11 27
+#define xmm12 28
+#define xmm13 29
+#define xmm14 30
+#define xmm15 31
+#endif
+
 #if sparc_TARGET_ARCH
 #define g0 0
 #define g1 1
@@ -824,11 +953,12 @@ allMachRegNos :: [RegNo]
 allMachRegNos
   = IF_ARCH_alpha( [0..63],
     IF_ARCH_i386(  [0..13],
+    IF_ARCH_x86_64( [0..31],
     IF_ARCH_sparc( ([0..31]
                     ++ [f0,f2 .. nCG_FirstFloatReg-1]
                     ++ [nCG_FirstFloatReg .. f31]),
     IF_ARCH_powerpc([0..63],
-    ))))
+    )))))

-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
-- i.e., these are the regs for which we are prepared to allow the @@ -854,6 +984,11 @@ callClobberedRegs -- caller-saves registers map RealReg [eax,ecx,edx,fake0,fake1,fake2,fake3,fake4,fake5] #endif /* i386_TARGET_ARCH */ +#if x86_64_TARGET_ARCH + -- caller-saves registers + map RealReg ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..31]) + -- all xmm regs are caller-saves +#endif /* x86_64_TARGET_ARCH */ #if sparc_TARGET_ARCH map RealReg ( oReg 7 : @@ -880,6 +1015,10 @@ argRegs :: RegNo -> [Reg] argRegs _ = panic "MachRegs.argRegs(x86): should not be used!" #endif +#if x86_64_TARGET_ARCH +argRegs _ = panic "MachRegs.argRegs(x86_64): should not be used!" +#endif + #if alpha_TARGET_ARCH argRegs 0 = [] argRegs 1 = freeMappedRegs [16, fReg 16] @@ -932,6 +1071,13 @@ allArgRegs :: [Reg] allArgRegs = panic "MachRegs.allArgRegs(x86): should not be used!" #endif +#if x86_64_TARGET_ARCH +allArgRegs :: [Reg] +allArgRegs = map RealReg [rdi,rsi,rdx,rcx,r8,r9] +allFPArgRegs :: [Reg] +allFPArgRegs = map RealReg [xmm0 .. xmm7] +#endif + #if powerpc_TARGET_ARCH allArgRegs :: [Reg] allArgRegs = map RealReg [3..10] @@ -960,6 +1106,10 @@ freeReg 63 = fastBool False -- always zero (f31) freeReg esp = fastBool False -- %esp is the C stack pointer #endif +#if x86_64_TARGET_ARCH +freeReg rsp = fastBool False -- %rsp is the C stack pointer +#endif + #if sparc_TARGET_ARCH freeReg g0 = fastBool False -- %g0 is always 0. freeReg g5 = fastBool False -- %g5 is reserved (ABI). diff --git a/ghc/compiler/nativeGen/NCG.h b/ghc/compiler/nativeGen/NCG.h index 78db0c9..b17f682 100644 --- a/ghc/compiler/nativeGen/NCG.h +++ b/ghc/compiler/nativeGen/NCG.h @@ -26,6 +26,12 @@ # define IF_ARCH_i386(x,y) y #endif -- - - - - - - - - - - - - - - - - - - - - - +#if x86_64_TARGET_ARCH +# define IF_ARCH_x86_64(x,y) x +#else +# define IF_ARCH_x86_64(x,y) y +#endif +-- - - - - - - - - - - - - - - - - - - - - - #if freebsd_TARGET_OS # define IF_OS_freebsd(x,y) x #else diff --git a/ghc/compiler/nativeGen/PprMach.hs b/ghc/compiler/nativeGen/PprMach.hs index 26b192f..197a82a 100644 --- a/ghc/compiler/nativeGen/PprMach.hs +++ b/ghc/compiler/nativeGen/PprMach.hs @@ -2,8 +2,8 @@ -- -- Pretty-printing assembly language -- - -- (c) The University of Glasgow 1993-2004 - -- +-- (c) The University of Glasgow 1993-2005 +-- ----------------------------------------------------------------------------- -- We start with the @pprXXX@s with some cross-platform commonality @@ -21,7 +21,7 @@ module PprMach ( #include "HsVersions.h" import Cmm -import MachOp ( MachRep(..) ) +import MachOp ( MachRep(..), wordRep, isFloatingRep ) import MachRegs -- may differ per-platform import MachInstrs @@ -115,13 +115,13 @@ pprBasicBlock (BasicBlock (BlockId id) instrs) = -- on which bit of it we care about. Yurgh. 
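[Editorial aside] What "which bit of it we care about" means concretely: a single x86_64 register number names four assembler registers, one per operand width, which is why pprReg below takes the MachRep as an extra argument (and why pprUserReg fixes it at the word size). A toy sketch covering just two registers; widthName is an illustrative helper, not the patch's:

widthName :: Int -> Int -> String    -- operand width in bits, register number
widthName w 0 = case w of { 8 -> "%al";  16 -> "%ax";  32 -> "%eax"; _ -> "%rax" }
widthName w 8 = case w of { 8 -> "%r8b"; 16 -> "%r8w"; 32 -> "%r8d"; _ -> "%r8" }
widthName _ n = "%reg" ++ show n     -- the real tables below cover all 32 registers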
pprUserReg :: Reg -> Doc
-pprUserReg = pprReg IF_ARCH_i386(I32,)
+pprUserReg = pprReg IF_ARCH_i386(I32,) IF_ARCH_x86_64(I64,)

-pprReg :: IF_ARCH_i386(MachRep ->,) Reg -> Doc
+pprReg :: IF_ARCH_i386(MachRep ->, IF_ARCH_x86_64(MachRep ->,)) Reg -> Doc

-pprReg IF_ARCH_i386(s,) r
+pprReg IF_ARCH_i386(s, IF_ARCH_x86_64(s,)) r
   = case r of
-      RealReg i      -> ppr_reg_no IF_ARCH_i386(s,) i
+      RealReg i      -> ppr_reg_no IF_ARCH_i386(s, IF_ARCH_x86_64(s,)) i
       VirtualRegI  u  -> text "%vI_"  <> asmSDoc (pprUnique u)
       VirtualRegHi u  -> text "%vHi_" <> asmSDoc (pprUnique u)
       VirtualRegF  u  -> text "%vF_"  <> asmSDoc (pprUnique u)
@@ -200,6 +200,74 @@ pprReg IF_ARCH_i386(s,) r
 	_  -> SLIT("very naughty I386 register")
       })
 #endif
+
+#if x86_64_TARGET_ARCH
+    ppr_reg_no :: MachRep -> Int -> Doc
+    ppr_reg_no I8   = ppr_reg_byte
+    ppr_reg_no I16  = ppr_reg_word
+    ppr_reg_no I32  = ppr_reg_long
+    ppr_reg_no _    = ppr_reg_quad
+
+    ppr_reg_byte i = ptext
+      (case i of {
+	 0 -> SLIT("%al");     1 -> SLIT("%bl");
+	 2 -> SLIT("%cl");     3 -> SLIT("%dl");
+	 4 -> SLIT("%sil");    5 -> SLIT("%dil"); -- new 8-bit regs!
+	 6 -> SLIT("%bpl");    7 -> SLIT("%spl");
+	 8 -> SLIT("%r8b");    9 -> SLIT("%r9b");
+	10 -> SLIT("%r10b");  11 -> SLIT("%r11b");
+	12 -> SLIT("%r12b");  13 -> SLIT("%r13b");
+	14 -> SLIT("%r14b");  15 -> SLIT("%r15b");
+	_  -> SLIT("very naughty x86_64 byte register")
+      })
+
+    ppr_reg_word i = ptext
+      (case i of {
+	 0 -> SLIT("%ax");     1 -> SLIT("%bx");
+	 2 -> SLIT("%cx");     3 -> SLIT("%dx");
+	 4 -> SLIT("%si");     5 -> SLIT("%di");
+	 6 -> SLIT("%bp");     7 -> SLIT("%sp");
+	 8 -> SLIT("%r8w");    9 -> SLIT("%r9w");
+	10 -> SLIT("%r10w");  11 -> SLIT("%r11w");
+	12 -> SLIT("%r12w");  13 -> SLIT("%r13w");
+	14 -> SLIT("%r14w");  15 -> SLIT("%r15w");
+	_  -> SLIT("very naughty x86_64 word register")
+      })
+
+    ppr_reg_long i = ptext
+      (case i of {
+	 0 -> SLIT("%eax");    1 -> SLIT("%ebx");
+	 2 -> SLIT("%ecx");    3 -> SLIT("%edx");
+	 4 -> SLIT("%esi");    5 -> SLIT("%edi");
+	 6 -> SLIT("%ebp");    7 -> SLIT("%esp");
+	 8 -> SLIT("%r8d");    9 -> SLIT("%r9d");
+	10 -> SLIT("%r10d");  11 -> SLIT("%r11d");
+	12 -> SLIT("%r12d");  13 -> SLIT("%r13d");
+	14 -> SLIT("%r14d");  15 -> SLIT("%r15d");
+	_  -> SLIT("very naughty x86_64 register")
+      })
+
+    ppr_reg_quad i = ptext
+      (case i of {
+	 0 -> SLIT("%rax");    1 -> SLIT("%rbx");
+	 2 -> SLIT("%rcx");    3 -> SLIT("%rdx");
+	 4 -> SLIT("%rsi");    5 -> SLIT("%rdi");
+	 6 -> SLIT("%rbp");    7 -> SLIT("%rsp");
+	 8 -> SLIT("%r8");     9 -> SLIT("%r9");
+	10 -> SLIT("%r10");   11 -> SLIT("%r11");
+	12 -> SLIT("%r12");   13 -> SLIT("%r13");
+	14 -> SLIT("%r14");   15 -> SLIT("%r15");
+	16 -> SLIT("%xmm0");  17 -> SLIT("%xmm1");
+	18 -> SLIT("%xmm2");  19 -> SLIT("%xmm3");
+	20 -> SLIT("%xmm4");  21 -> SLIT("%xmm5");
+	22 -> SLIT("%xmm6");  23 -> SLIT("%xmm7");
+	24 -> SLIT("%xmm8");  25 -> SLIT("%xmm9");
+	26 -> SLIT("%xmm10"); 27 -> SLIT("%xmm11");
+	28 -> SLIT("%xmm12"); 29 -> SLIT("%xmm13");
+	30 -> SLIT("%xmm14"); 31 -> SLIT("%xmm15")
+      })
+#endif
+
 #if sparc_TARGET_ARCH
     ppr_reg_no :: Int -> Doc
     ppr_reg_no i = ptext
@@ -290,7 +358,7 @@ pprReg IF_ARCH_i386(s,) r

-- -----------------------------------------------------------------------------
-- pprSize: print a 'Size'

-#if powerpc_TARGET_ARCH || i386_TARGET_ARCH
+#if powerpc_TARGET_ARCH || i386_TARGET_ARCH || x86_64_TARGET_ARCH
pprSize :: MachRep -> Doc
#else
pprSize :: Size -> Doc
@@ -310,14 +378,21 @@ pprSize x = ptext (case x of
--	SF  -> SLIT("s")    UNUSED
	TF  -> SLIT("t")
#endif
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
	I8   -> SLIT("b")
	I16  -> SLIT("w")
	I32  -> SLIT("l")
-	F32  -> SLIT("s")
-	F64  -> SLIT("l")
SLIT("l") + I64 -> SLIT("q") +#endif +#if i386_TARGET_ARCH + F32 -> SLIT("l") + F64 -> SLIT("q") F80 -> SLIT("t") #endif +#if x86_64_TARGET_ARCH + F32 -> SLIT("ss") -- "scalar single-precision float" (SSE2) + F64 -> SLIT("sd") -- "scalar double-precision float" (SSE2) +#endif #if sparc_TARGET_ARCH B -> SLIT("sb") Bu -> SLIT("ub") @@ -362,9 +437,9 @@ pprCond c = ptext (case c of { GTT -> SLIT("gt"); GE -> SLIT("ge") #endif -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH GEU -> SLIT("ae"); LU -> SLIT("b"); - EQQ -> SLIT("e"); GTT -> SLIT("g"); + EQQ -> SLIT("e"); GTT -> SLIT("g"); GE -> SLIT("ge"); GU -> SLIT("a"); LTT -> SLIT("l"); LE -> SLIT("le"); LEU -> SLIT("be"); NE -> SLIT("ne"); @@ -466,7 +541,7 @@ pprAddr (AddrRegImm r1 i) ------------------- -#if i386_TARGET_ARCH +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH pprAddr (ImmAddr imm off) = let pp_imm = pprImm imm in @@ -481,7 +556,7 @@ pprAddr (AddrBaseIndex base index displacement) = let pp_disp = ppr_disp displacement pp_off p = pp_disp <> char '(' <> p <> char ')' - pp_reg r = pprReg I32 r + pp_reg r = pprReg wordRep r in case (base,index) of (Nothing, Nothing) -> pp_disp @@ -540,39 +615,59 @@ pprSectionHeader Text IF_ARCH_alpha(SLIT("\t.text\n\t.align 3") {-word boundary-} ,IF_ARCH_sparc(SLIT(".text\n\t.align 4") {-word boundary-} ,IF_ARCH_i386(SLIT(".text\n\t.align 4,0x90") {-needs per-OS variation!-} + ,IF_ARCH_x86_64(SLIT(".text\n\t.align 8") {-needs per-OS variation!-} ,IF_ARCH_powerpc(SLIT(".text\n.align 2") - ,)))) + ,))))) pprSectionHeader Data = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} ,IF_ARCH_i386(SLIT(".data\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".data\n\t.align 8") ,IF_ARCH_powerpc(SLIT(".data\n.align 2") - ,)))) + ,))))) pprSectionHeader ReadOnlyData = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".section .rodata\n\t.align 8") ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const\n.align 2"), SLIT(".section .rodata\n\t.align 2")) - ,)))) + ,))))) pprSectionHeader RelocatableReadOnlyData = ptext IF_ARCH_alpha(SLIT("\t.data\n\t.align 3") ,IF_ARCH_sparc(SLIT(".data\n\t.align 8") {-<8 will break double constants -} - ,IF_ARCH_i386(SLIT(".data\n\t.align 4") + ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 4") + ,IF_ARCH_x86_64(SLIT(".text\n\t.align 8") ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const_data\n.align 2"), SLIT(".data\n\t.align 2")) - ,)))) + ,))))) + -- the assembler on x86_64/Linux refuses to generate code for + -- .quad x - y + -- where x is in the text section and y in the rodata section. + -- It works if y is in the text section, though. This is probably + -- going to cause difficulties for PIC, I imagine. 
 pprSectionHeader UninitialisedData
   = ptext
 	IF_ARCH_alpha(SLIT("\t.bss\n\t.align 3")
       ,IF_ARCH_sparc(SLIT(".bss\n\t.align 8") {-<8 will break double constants -}
       ,IF_ARCH_i386(SLIT(".section .bss\n\t.align 4")
+      ,IF_ARCH_x86_64(SLIT(".section .bss\n\t.align 8")
       ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const_data\n.align 2"),
                                     SLIT(".section .bss\n\t.align 2"))
-      ,))))
+      ,)))))
+pprSectionHeader ReadOnlyData16
+  = ptext
+	IF_ARCH_alpha(SLIT("\t.data\n\t.align 4")
+      ,IF_ARCH_sparc(SLIT(".data\n\t.align 16")
+      ,IF_ARCH_i386(SLIT(".section .rodata\n\t.align 16")
+      ,IF_ARCH_x86_64(SLIT(".section .rodata.cst16\n\t.align 16")
+      ,IF_ARCH_powerpc(IF_OS_darwin(SLIT(".const\n.align 4"),
+                                    SLIT(".section .rodata\n\t.align 4"))
+      ,)))))
+
 pprSectionHeader (OtherSection sec)
   = panic "PprMach.pprSectionHeader: unknown section"
@@ -586,11 +681,8 @@ pprData (CmmStaticLit lit)       = pprDataItem lit
 
 pprGloblDecl :: CLabel -> Doc
 pprGloblDecl lbl
   | not (externallyVisibleCLabel lbl) = empty
-  | otherwise = ptext IF_ARCH_alpha(SLIT(".globl ")
-			,IF_ARCH_i386(SLIT(".globl ")
-			,IF_ARCH_sparc(SLIT(".global ")
-			,IF_ARCH_powerpc(SLIT(".globl ")
-			,)))) <>
+  | otherwise = ptext IF_ARCH_sparc(SLIT(".global "),
+				    SLIT(".globl ")) <>
 		pprCLabel_asm lbl
 
 pprLabel :: CLabel -> Doc
@@ -612,8 +704,9 @@ pprASCII str
 
 pprAlign bytes =
 	IF_ARCH_alpha(ptext SLIT(".align ") <> int pow2,
 	IF_ARCH_i386(ptext SLIT(".align ") <> int bytes,
+	IF_ARCH_x86_64(ptext SLIT(".align ") <> int bytes,
 	IF_ARCH_sparc(ptext SLIT(".align ") <> int bytes,
-	IF_ARCH_powerpc(ptext SLIT(".align ") <> int pow2,))))
+	IF_ARCH_powerpc(ptext SLIT(".align ") <> int pow2,)))))
   where
 	pow2 = log2 bytes
@@ -646,7 +739,7 @@ pprDataItem lit
 	ppr_item I16  x = [ptext SLIT("\t.short\t") <> pprImm imm]
 	ppr_item I64  x = [ptext SLIT("\t.quad\t") <> pprImm imm]
 #endif
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	ppr_item I16  x = [ptext SLIT("\t.word\t") <> pprImm imm]
 	ppr_item I64  x = [ptext SLIT("\t.quad\t") <> pprImm imm]
 #endif
@@ -672,10 +765,11 @@ pprInstr (COMMENT s)
    = IF_ARCH_alpha( ((<>) (ptext SLIT("\t# ")) (ftext s))
     ,IF_ARCH_sparc( ((<>) (ptext SLIT("! "))   (ftext s))
     ,IF_ARCH_i386( ((<>) (ptext SLIT("# "))   (ftext s))
+    ,IF_ARCH_x86_64( ((<>) (ptext SLIT("# "))   (ftext s))
     ,IF_ARCH_powerpc( IF_OS_linux(
        ((<>) (ptext SLIT("# ")) (ftext s)),
        ((<>) (ptext SLIT("; ")) (ftext s)))
-    ,))))
+    ,)))))
 
 pprInstr (DELTA d)
    = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
@@ -1071,7 +1165,7 @@ pprSizeRegRegReg name size reg1 reg2 reg3
 -- -----------------------------------------------------------------------------
 -- pprInstr for an x86
 
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 pprInstr v@(MOV size s@(OpReg src) d@(OpReg dst)) -- hack
   | src == dst
@@ -1081,10 +1175,18 @@ pprInstr v@(MOV size s@(OpReg src) d@(OpReg dst)) -- hack
 #else
 	empty
 #endif
+
 pprInstr (MOV size src dst)
   = pprSizeOpOp SLIT("mov") size src dst
-pprInstr (MOVZxL sizes src dst) = pprSizeOpOpCoerce SLIT("movz") sizes I32 src dst
-pprInstr (MOVSxL sizes src dst) = pprSizeOpOpCoerce SLIT("movs") sizes I32 src dst
+
+pprInstr (MOVZxL I32 src dst) = pprSizeOpOp SLIT("mov") I32 src dst
+	-- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
+	-- movl.  But we represent it as a MOVZxL instruction, because
+	-- the reg alloc would tend to throw away a plain reg-to-reg
+	-- move, and we still want it to do that.
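To see why the zero-extending movl must not look like a plain move, compare it against the register allocator's move test (a sketch simplified from isRegRegMove in RegAllocInfo.hs, further down in this patch; the catch-all equation is added here for illustration):

    -- A MOV between two registers is a coalescing candidate, so a
    -- semantically significant "movl %eax,%eax" emitted as MOV could be
    -- discarded when src and dst end up in the same real register ...
    isRegRegMove (MOV _ (OpReg r1) (OpReg r2)) = Just (r1,r2)
    -- ... whereas MOVZxL never matches, so the instruction survives and
    -- the upper 32 bits are still cleared.
    isRegRegMove _ = Nothing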
+
+pprInstr (MOVZxL sizes src dst) = pprSizeOpOpCoerce SLIT("movz") sizes wordRep src dst
+pprInstr (MOVSxL sizes src dst) = pprSizeOpOpCoerce SLIT("movs") sizes wordRep src dst
 
 -- here we do some patching, since the physical registers are only set late
 -- in the code generation.
@@ -1117,11 +1219,13 @@ pprInstr (IMUL size op1 op2) = pprSizeOpOp SLIT("imul") size op1 op2
    however, cannot be used to determine if the upper half of the result
    is non-zero."  So there.
 -}
-pprInstr (MUL size op1 op2) = pprSizeOpOp SLIT("imul") size op1 op2
-
 pprInstr (AND size src dst) = pprSizeOpOp SLIT("and") size src dst
 pprInstr (OR  size src dst) = pprSizeOpOp SLIT("or")  size src dst
+
+pprInstr (XOR F32 src dst)  = pprOpOp SLIT("xorps") F32 src dst
+pprInstr (XOR F64 src dst)  = pprOpOp SLIT("xorpd") F64 src dst
 pprInstr (XOR size src dst) = pprSizeOpOp SLIT("xor")  size src dst
+
 pprInstr (NOT size op) = pprSizeOp SLIT("not") size op
 pprInstr (NEGI size op) = pprSizeOp SLIT("neg") size op
 
@@ -1131,7 +1235,10 @@ pprInstr (SHR size src dst) = pprShift SLIT("shr") size src dst
 
 pprInstr (BT  size imm src) = pprSizeImmOp SLIT("bt") size imm src
 
-pprInstr (CMP size src dst) = pprSizeOpOp SLIT("cmp")  size src dst
+pprInstr (CMP size src dst)
+  | isFloatingRep size = pprSizeOpOp SLIT("ucomi") size src dst -- SSE2
+  | otherwise          = pprSizeOpOp SLIT("cmp")   size src dst
+
 pprInstr (TEST size src dst) = pprSizeOpOp SLIT("test")  size src dst
 pprInstr (PUSH size op) = pprSizeOp SLIT("push") size op
 pprInstr (POP size op) = pprSizeOp SLIT("pop") size op
@@ -1141,7 +1248,8 @@ pprInstr (POP size op) = pprSizeOp SLIT("pop") size op
 -- pprInstr POPA = ptext SLIT("\tpopal")
 
 pprInstr NOP = ptext SLIT("\tnop")
-pprInstr CLTD = ptext SLIT("\tcltd")
+pprInstr (CLTD I32) = ptext SLIT("\tcltd")
+pprInstr (CLTD I64) = ptext SLIT("\tcqto")
 
 pprInstr (SETCC cond op) = pprCondInstr SLIT("set") cond (pprOperand I8 op)
 
@@ -1150,17 +1258,42 @@ pprInstr (JXX cond (BlockId id))
   where lab = mkAsmTempLabel id
 
 pprInstr (JMP (OpImm imm)) = (<>) (ptext SLIT("\tjmp ")) (pprImm imm)
-pprInstr (JMP op) = (<>) (ptext SLIT("\tjmp *")) (pprOperand I32 op)
+pprInstr (JMP op) = (<>) (ptext SLIT("\tjmp *")) (pprOperand wordRep op)
 pprInstr (JMP_TBL op ids) = pprInstr (JMP op)
 
 pprInstr (CALL (Left imm)) = (<>) (ptext SLIT("\tcall ")) (pprImm imm)
-pprInstr (CALL (Right reg)) = (<>) (ptext SLIT("\tcall *")) (pprReg I32 reg)
+pprInstr (CALL (Right reg)) = (<>) (ptext SLIT("\tcall *")) (pprReg wordRep reg)
 
 pprInstr (IDIV sz op) = pprSizeOp SLIT("idiv") sz op
 pprInstr (DIV sz op)  = pprSizeOp SLIT("div") sz op
 pprInstr (IMUL64 sd_hi sd_lo) = pprInstr_imul64 sd_hi sd_lo
 
+#if x86_64_TARGET_ARCH
+pprInstr (MUL size op1 op2) = pprSizeOpOp SLIT("mul") size op1 op2
+
+pprInstr (FDIV size op1 op2) = pprSizeOpOp SLIT("div") size op1 op2
+
+pprInstr (CVTSS2SD from to) = pprRegReg SLIT("cvtss2sd") from to
+pprInstr (CVTSD2SS from to) = pprRegReg SLIT("cvtsd2ss") from to
+pprInstr (CVTSS2SI from to) = pprOpReg SLIT("cvtss2si") from to
+pprInstr (CVTSD2SI from to) = pprOpReg SLIT("cvtsd2si") from to
+pprInstr (CVTSI2SS from to) = pprOpReg SLIT("cvtsi2ss") from to
+pprInstr (CVTSI2SD from to) = pprOpReg SLIT("cvtsi2sd") from to
+#endif
+
+pprInstr (FETCHGOT reg)
+   = vcat [ ptext SLIT("\tcall 1f"),
+            hcat [ ptext SLIT("1:\tpopl\t"), pprReg I32 reg ],
+            hcat [ ptext SLIT("\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), "),
+                   pprReg I32 reg ]
+          ]
+#endif
+
+-- -----------------------------------------------------------------------------
+-- i386 floating-point
+
+#if i386_TARGET_ARCH
 -- Simulating a flat register set on the x86 FP stack is tricky.
 -- you have to free %st(7) before pushing anything on the FP reg stack
 -- so as to preclude the possibility of a FP stack overflow exception.
@@ -1357,31 +1490,6 @@ pprInstr GFREE
 	   ptext SLIT("\tffree %st(4) ;ffree %st(5) ;ffree %st(6) ;ffree %st(7)")
         ]
 
-pprInstr (FETCHGOT reg)
-   = vcat [ ptext SLIT("\tcall 1f"),
-            hcat [ ptext SLIT("1:\tpopl\t"), pprReg I32 reg ],
-            hcat [ ptext SLIT("\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), "),
-                   pprReg I32 reg ]
-          ]
-
--- Emit code to make hi_reg:lo_reg be the 64-bit product of hi_reg and lo_reg
-pprInstr_imul64 hi_reg lo_reg
-   = let fakeInsn = text "imul64" <+> pp_hi_reg <> comma <+> pp_lo_reg
-         pp_hi_reg = pprReg I32 hi_reg
-         pp_lo_reg = pprReg I32 lo_reg
-     in
-     vcat [
-        text "\t# BEGIN " <> fakeInsn,
-        text "\tpushl" <+> pp_hi_reg <> text" ; pushl" <+> pp_lo_reg,
-        text "\tpushl %eax ; pushl %edx",
-        text "\tmovl 12(%esp), %eax ; imull 8(%esp)",
-        text "\tmovl %edx, 12(%esp) ; movl %eax, 8(%esp)",
-        text "\tpopl %edx ; popl %eax",
-        text "\tpopl" <+> pp_lo_reg <> text " ; popl" <+> pp_hi_reg,
-        text "\t# END " <> fakeInsn
-     ]
-
-
 --------------------------
 -- coerce %st(0) to the specified size
@@ -1431,7 +1539,26 @@ pprGInstr (GADD sz src1 src2 dst) = pprSizeRegRegReg SLIT("gadd") sz src1 src2 dst
 pprGInstr (GSUB sz src1 src2 dst) = pprSizeRegRegReg SLIT("gsub") sz src1 src2 dst
 pprGInstr (GMUL sz src1 src2 dst) = pprSizeRegRegReg SLIT("gmul") sz src1 src2 dst
 pprGInstr (GDIV sz src1 src2 dst) = pprSizeRegRegReg SLIT("gdiv") sz src1 src2 dst
+#endif
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
+-- Emit code to make hi_reg:lo_reg be the 64-bit product of hi_reg and lo_reg
+pprInstr_imul64 hi_reg lo_reg
+   = let fakeInsn = text "imul64" <+> pp_hi_reg <> comma <+> pp_lo_reg
+         pp_hi_reg = pprReg wordRep hi_reg
+         pp_lo_reg = pprReg wordRep lo_reg
+     in
+     vcat [
+        text "\t# BEGIN " <> fakeInsn,
+        text "\tpushl" <+> pp_hi_reg <> text" ; pushl" <+> pp_lo_reg,
+        text "\tpushl %eax ; pushl %edx",
+        text "\tmovl 12(%esp), %eax ; imull 8(%esp)",
+        text "\tmovl %edx, 12(%esp) ; movl %eax, 8(%esp)",
+        text "\tpopl %edx ; popl %eax",
+        text "\tpopl" <+> pp_lo_reg <> text " ; popl" <+> pp_hi_reg,
+        text "\t# END " <> fakeInsn
+     ]
 
 -- Continue with I386-only printing bits and bobs:
 
 pprDollImm :: Imm -> Doc
@@ -1443,6 +1570,10 @@ pprOperand s (OpReg r)   = pprReg s r
 pprOperand s (OpImm i)   = pprDollImm i
 pprOperand s (OpAddr ea) = pprAddr ea
 
+pprMnemonic_ :: LitString -> Doc
+pprMnemonic_ name =
+   char '\t' <> ptext name <> space
+
 pprMnemonic :: LitString -> MachRep -> Doc
 pprMnemonic name size =
   char '\t' <> ptext name <> pprSize size <> space
@@ -1473,6 +1604,15 @@ pprSizeOpOp name size op1 op2
 	pprOperand size op2
     ]
 
+pprOpOp :: LitString -> MachRep -> Operand -> Operand -> Doc
+pprOpOp name size op1 op2
+  = hcat [
+	pprMnemonic_ name,
+	pprOperand size op1,
+	comma,
+	pprOperand size op2
+    ]
+
 pprSizeReg :: LitString -> MachRep -> Reg -> Doc
 pprSizeReg name size reg1
   = hcat [
@@ -1489,6 +1629,24 @@ pprSizeRegReg name size reg1 reg2
 	pprReg size reg2
     ]
 
+pprRegReg :: LitString -> Reg -> Reg -> Doc
+pprRegReg name reg1 reg2
+  = hcat [
+	pprMnemonic_ name,
+	pprReg wordRep reg1,
+	comma,
+	pprReg wordRep reg2
+    ]
+
+pprOpReg :: LitString -> Operand -> Reg -> Doc
+pprOpReg name op1 reg2
+  = hcat [
+	pprMnemonic_ name,
+	pprOperand wordRep op1,
+	comma,
+	pprReg wordRep reg2
+    ]
+
 pprCondRegReg :: LitString -> MachRep -> Cond -> Reg -> Reg -> Doc
 pprCondRegReg name size cond reg1 reg2
   = hcat [
diff --git a/ghc/compiler/nativeGen/RegAllocInfo.hs b/ghc/compiler/nativeGen/RegAllocInfo.hs
index 7d1bf48..6b929e5 100644
--- a/ghc/compiler/nativeGen/RegAllocInfo.hs
+++ b/ghc/compiler/nativeGen/RegAllocInfo.hs
@@ -24,7 +24,7 @@ module RegAllocInfo (
 #include "HsVersions.h"
 
 import Cmm		( BlockId )
-#if powerpc_TARGET_ARCH || i386_TARGET_ARCH
+#if powerpc_TARGET_ARCH || i386_TARGET_ARCH || x86_64_TARGET_ARCH
 import MachOp		( MachRep(..) )
 #endif
 import MachInstrs
@@ -138,7 +138,7 @@ regUsage instr = case instr of
 #endif /* alpha_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 regUsage instr = case instr of
     MOV    sz src dst	-> usageRW src dst
@@ -173,9 +173,10 @@ regUsage instr = case instr of
     JMP_TBL op ids	-> mkRU (use_R op) []
     CALL (Left imm)	-> mkRU [] callClobberedRegs
     CALL (Right reg)	-> mkRU [reg] callClobberedRegs
-    CLTD		-> mkRU [eax] [edx]
+    CLTD sz		-> mkRU [eax] [edx]
     NOP			-> mkRU [] []
 
+#if i386_TARGET_ARCH
     GMOV   src dst	-> mkRU [src] [dst]
     GLD    sz src dst	-> mkRU (use_EA src) [dst]
     GST    sz src dst	-> mkRU (src : use_EA dst) []
@@ -201,6 +202,17 @@ regUsage instr = case instr of
     GSIN   sz src dst	-> mkRU [src] [dst]
     GCOS   sz src dst	-> mkRU [src] [dst]
     GTAN   sz src dst	-> mkRU [src] [dst]
+#endif
+
+#if x86_64_TARGET_ARCH
+    CVTSS2SD src dst	-> mkRU [src] [dst]
+    CVTSD2SS src dst	-> mkRU [src] [dst]
+    CVTSS2SI src dst	-> mkRU (use_R src) [dst]
+    CVTSD2SI src dst	-> mkRU (use_R src) [dst]
+    CVTSI2SS src dst	-> mkRU (use_R src) [dst]
+    CVTSI2SD src dst	-> mkRU (use_R src) [dst]
+    FDIV sz src dst	-> usageRM src dst
+#endif
 
     FETCHGOT reg	-> mkRU [] [reg]
 
@@ -244,7 +256,7 @@ regUsage instr = case instr of
     mkRU src dst = RU (filter interesting src)
    		      (filter interesting dst)
 
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 #if sparc_TARGET_ARCH
@@ -370,7 +382,7 @@ regUsage instr = case instr of
 jumpDests :: Instr -> [BlockId] -> [BlockId]
 jumpDests insn acc
   = case insn of
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 	JXX _ id	-> id : acc
 	JMP_TBL _ ids	-> ids ++ acc
 #elif powerpc_TARGET_ARCH
@@ -445,7 +457,7 @@ patchRegs instr env = case instr of
 #endif /* alpha_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-#if i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 
 patchRegs instr env = case instr of
     MOV  sz src dst	-> patch2 (MOV  sz) src dst
@@ -477,6 +489,7 @@ patchRegs instr env = case instr of
     JMP op		-> patch1 JMP op
     JMP_TBL op ids	-> patch1 JMP_TBL op $ ids
 
+#if i386_TARGET_ARCH
     GMOV src dst	-> GMOV (env src) (env dst)
     GLD  sz src dst	-> GLD sz (lookupAddr src) (env dst)
     GST  sz src dst	-> GST sz (env src) (lookupAddr dst)
@@ -502,6 +515,17 @@ patchRegs instr env = case instr of
     GSIN sz src dst	-> GSIN sz (env src) (env dst)
     GCOS sz src dst	-> GCOS sz (env src) (env dst)
    GTAN sz src dst	-> GTAN sz (env src) (env dst)
+#endif
+
+#if x86_64_TARGET_ARCH
+    CVTSS2SD src dst	-> CVTSS2SD (env src) (env dst)
+    CVTSD2SS src dst	-> CVTSD2SS (env src) (env dst)
+    CVTSS2SI src dst	-> CVTSS2SI (patchOp src) (env dst)
+    CVTSD2SI src dst	-> CVTSD2SI (patchOp src) (env dst)
+    CVTSI2SS src dst	-> CVTSI2SS (patchOp src) (env dst)
+    CVTSI2SD src dst	-> CVTSI2SD (patchOp src) (env dst)
+    FDIV sz src dst	-> FDIV sz (patchOp src) (patchOp dst)
+#endif
 
     CALL (Left imm)	-> instr
     CALL (Right reg)	-> CALL (Right (env reg))
@@ -512,7 +536,7 @@ patchRegs instr env = case instr of
     COMMENT _		-> instr
     DELTA _		-> instr
     JXX _ _		-> instr
-    CLTD		-> instr
+    CLTD _		-> instr
     _other		-> panic "patchRegs: unrecognised instr"
 
@@ -534,7 +558,7 @@ patchRegs instr env = case instr of
     lookupIndex  Nothing      = Nothing
     lookupIndex  (Just (r,i)) = Just (env r, i)
 
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 #if sparc_TARGET_ARCH
@@ -645,7 +669,7 @@ patchRegs instr env = case instr of
 -- by assigning the src and dest temporaries to the same real register.
 
 isRegRegMove :: Instr -> Maybe (Reg,Reg)
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 -- TMP:
 isRegRegMove (MOV _ (OpReg r1) (OpReg r2)) = Just (r1,r2)
 #elif powerpc_TARGET_ARCH
@@ -678,6 +702,12 @@ mkSpillInstr reg delta slot
 	RcInteger -> MOV I32 (OpReg reg) (OpAddr (spRel off_w))
 	_         -> GST F80 reg (spRel off_w) {- RcFloat/RcDouble -}
 #endif
+#ifdef x86_64_TARGET_ARCH
+    let off_w = (off-delta) `div` 8
+    in case regClass reg of
+	RcInteger -> MOV I64 (OpReg reg) (OpAddr (spRel off_w))
+	_         -> panic "mkSpillInstr: ToDo"
+#endif
 #ifdef sparc_TARGET_ARCH
 	{-SPARC: spill below frame pointer leaving 2 words/spill-}
 	let{off_w = 1 + (off `div` 4);
@@ -705,16 +735,22 @@ mkLoadInstr reg delta slot
 	let off     = spillSlotToOffset slot
 	in
-#ifdef alpha_TARGET_ARCH
+#if alpha_TARGET_ARCH
 	LD sz dyn (spRel (- (off `div` 8)))
 #endif
-#ifdef i386_TARGET_ARCH
+#if i386_TARGET_ARCH
 	let off_w = (off-delta) `div` 4
        in case regClass reg of {
              RcInteger -> MOV I32 (OpAddr (spRel off_w)) (OpReg reg);
             _         -> GLD F80 (spRel off_w) reg} {- RcFloat/RcDouble -}
 #endif
-#ifdef sparc_TARGET_ARCH
+#if x86_64_TARGET_ARCH
+	let off_w = (off-delta) `div` 8
+        in case regClass reg of
+              RcInteger -> MOV I64 (OpAddr (spRel off_w)) (OpReg reg)
+              _         -> panic "mkLoadInstr: ToDo"
+#endif
+#if sparc_TARGET_ARCH
 	let{off_w = 1 + (off `div` 4);
            sz = case regClass vreg of {
                    RcInteger -> W;
@@ -722,7 +758,7 @@ mkLoadInstr reg delta slot
                   RcDouble  -> DF}}
        in LD sz (fpRel (- off_w)) dyn
 #endif
-#ifdef powerpc_TARGET_ARCH
+#if powerpc_TARGET_ARCH
 	let sz = case regClass reg of
                    RcInteger -> I32
                    RcDouble -> F64
-- 
1.7.10.4
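The two "ToDo" panics above leave floating-point spill and reload unimplemented on x86_64. A hedged sketch of what the missing alternatives might look like, mirroring the shape of the i386 GST/GLD cases and assuming a MOV at F64 prints as movsd via the SSE2 suffixes this patch adds to pprSize (not part of the actual patch; spill slots are word-sized, so one F64 move covers both float reps):

    -- hypothetical x86_64 float cases for mkSpillInstr / mkLoadInstr
    _ -> MOV F64 (OpReg reg) (OpAddr (spRel off_w))   -- spill:  movsd reg -> stack
    _ -> MOV F64 (OpAddr (spRel off_w)) (OpReg reg)   -- reload: movsd stack -> reg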