From 335b9f366ac440259318777c4c07e4fa42fbbec6 Mon Sep 17 00:00:00 2001
From: Simon Marlow <marlowsd@gmail.com>
Date: Thu, 4 Feb 2010 10:48:49 +0000
Subject: [PATCH] Implement SSE2 floating-point support in the x86 native code
 generator (#594)

The new flag -msse2 enables code generation for SSE2 on x86.  It
results in substantially faster floating-point performance; the main
reason for doing this was that our x87 code generation is appallingly
bad, and since we plan to drop -fvia-C soon, we need a way to generate
half-decent floating-point code.

The catch is that SSE2 is only available on CPUs that support it (P4+,
AMD K8+).  We'll have to think hard about whether we should enable it
by default for the libraries we ship.  In the meantime, at least
-msse2 should be an acceptable replacement for "-fvia-C
-optc-ffast-math -fexcess-precision".

SSE2 also has the advantage of performing all operations at the
correct precision, so floating-point results are consistent with other
platforms.

I also tweaked the x87 code generation a bit while I was here, now
it's slighlty less bad than before.
---
 compiler/main/DynFlags.hs                          |    4 +
 compiler/nativeGen/PPC/Ppr.hs                      |    1 +
 compiler/nativeGen/PPC/Regs.hs                     |   29 +-
 compiler/nativeGen/Reg.hs                          |    5 +
 compiler/nativeGen/RegAlloc/Graph/Main.hs          |   18 +-
 compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs |   18 +-
 compiler/nativeGen/RegClass.hs                     |    3 +
 compiler/nativeGen/SPARC/Instr.hs                  |    3 +
 compiler/nativeGen/SPARC/Ppr.hs                    |    1 +
 compiler/nativeGen/SPARC/Regs.hs                   |   14 +-
 compiler/nativeGen/X86/CodeGen.hs                  |  796 ++++++++++----------
 compiler/nativeGen/X86/Instr.hs                    |   86 +--
 compiler/nativeGen/X86/Ppr.hs                      |   93 +--
 compiler/nativeGen/X86/RegInfo.hs                  |   34 +-
 compiler/nativeGen/X86/Regs.hs                     |  243 ++----
 docs/users_guide/flags.xml                         |    8 +
 docs/users_guide/using.xml                         |   20 +-
 17 files changed, 630 insertions(+), 746 deletions(-)

diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 4ba19b0..abef731 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -312,6 +312,7 @@ data DynFlag
    | Opt_EmitExternalCore
    | Opt_SharedImplib
    | Opt_BuildingCabalPackage
+   | Opt_SSE2
 
 	-- temporary flags
    | Opt_RunCPS
@@ -1265,6 +1266,9 @@ dynamic_flags = [
   , Flag "monly-4-regs" (NoArg (upd (\s -> s{stolen_x86_regs = 4}) ))
          Supported
 
+  , Flag "msse2" (NoArg (setDynFlag Opt_SSE2))
+         Supported
+
      ------ Warning opts -------------------------------------------------
   , Flag "W"      (NoArg (mapM_ setDynFlag   minusWOpts))
          Supported
diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs
index ec6d941..2d8f044 100644
--- a/compiler/nativeGen/PPC/Ppr.hs
+++ b/compiler/nativeGen/PPC/Ppr.hs
@@ -171,6 +171,7 @@ pprReg r
       RegVirtual (VirtualRegHi u)  -> text "%vHi_" <> asmSDoc (pprUnique u)
       RegVirtual (VirtualRegF  u)  -> text "%vF_" <> asmSDoc (pprUnique u)
       RegVirtual (VirtualRegD  u)  -> text "%vD_" <> asmSDoc (pprUnique u)
+      RegVirtual (VirtualRegSSE  u) -> text "%vSSE_" <> asmSDoc (pprUnique u)
   where
 #if darwin_TARGET_OS
     ppr_reg_no :: Int -> Doc
diff --git a/compiler/nativeGen/PPC/Regs.hs b/compiler/nativeGen/PPC/Regs.hs
index 18f06ed..d649d84 100644
--- a/compiler/nativeGen/PPC/Regs.hs
+++ b/compiler/nativeGen/PPC/Regs.hs
@@ -87,25 +87,15 @@ virtualRegSqueeze cls vr
 	 -> case vr of
 	 	VirtualRegI{}		-> _ILIT(1)
 		VirtualRegHi{}		-> _ILIT(1)
-		VirtualRegD{}		-> _ILIT(0)
-		VirtualRegF{}		-> _ILIT(0)
-
-	-- We don't use floats on this arch, but we can't
-	--	return error because the return type is unboxed...
-	RcFloat
-	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
-		VirtualRegD{}		-> _ILIT(0)
-		VirtualRegF{}		-> _ILIT(0)
+                _other                  -> _ILIT(0)
 
 	RcDouble
 	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
 		VirtualRegD{}		-> _ILIT(1)
 		VirtualRegF{}		-> _ILIT(0)
+                _other                  -> _ILIT(0)
 
+        _other -> _ILIT(0)
 
 {-# INLINE realRegSqueeze #-}
 realRegSqueeze :: RegClass -> RealReg -> FastInt
@@ -119,16 +109,6 @@ realRegSqueeze cls rr
 			
 		RealRegPair{}		-> _ILIT(0)
 
-	-- We don't use floats on this arch, but we can't
-	--	return error because the return type is unboxed...
-	RcFloat
-	 -> case rr of
-	 	RealRegSingle regNo
-			| regNo	< 32	-> _ILIT(0)
-			| otherwise	-> _ILIT(0)
-			
-		RealRegPair{}		-> _ILIT(0)
-
 	RcDouble
 	 -> case rr of
 	 	RealRegSingle regNo
@@ -137,6 +117,8 @@ realRegSqueeze cls rr
 			
 		RealRegPair{}		-> _ILIT(0)
 
+        _other -> _ILIT(0)
+
 mkVirtualReg :: Unique -> Size -> VirtualReg
 mkVirtualReg u size
    | not (isFloatSize size) = VirtualRegI u
@@ -152,6 +134,7 @@ regDotColor reg
         RcInteger       -> Outputable.text "blue"
         RcFloat         -> Outputable.text "red"
         RcDouble        -> Outputable.text "green"
+        RcDoubleSSE     -> Outputable.text "yellow"
 
 
 -- immediates ------------------------------------------------------------------
diff --git a/compiler/nativeGen/Reg.hs b/compiler/nativeGen/Reg.hs
index 422ea24..27315ba 100644
--- a/compiler/nativeGen/Reg.hs
+++ b/compiler/nativeGen/Reg.hs
@@ -55,6 +55,7 @@ data VirtualReg
 	| VirtualRegHi {-# UNPACK #-} !Unique  -- High part of 2-word register
 	| VirtualRegF  {-# UNPACK #-} !Unique
 	| VirtualRegD  {-# UNPACK #-} !Unique
+	| VirtualRegSSE {-# UNPACK #-} !Unique
 	deriving (Eq, Show, Ord)
 
 instance Uniquable VirtualReg where
@@ -64,6 +65,7 @@ instance Uniquable VirtualReg where
 		VirtualRegHi u	-> u
 		VirtualRegF u	-> u
 		VirtualRegD u	-> u
+		VirtualRegSSE u	-> u
 
 instance Outputable VirtualReg where
 	ppr reg
@@ -72,6 +74,7 @@ instance Outputable VirtualReg where
 		VirtualRegHi u  -> text "%vHi_" <> pprUnique u
 		VirtualRegF  u  -> text "%vF_" 	<> pprUnique u
 		VirtualRegD  u  -> text "%vD_" 	<> pprUnique u
+		VirtualRegSSE u -> text "%vSSE_" <> pprUnique u
 
 
 renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
@@ -81,6 +84,7 @@ renameVirtualReg u r
 	VirtualRegHi _	-> VirtualRegHi u
 	VirtualRegF _	-> VirtualRegF  u
 	VirtualRegD _	-> VirtualRegD  u
+	VirtualRegSSE _	-> VirtualRegSSE u
 
 
 classOfVirtualReg :: VirtualReg -> RegClass
@@ -90,6 +94,7 @@ classOfVirtualReg vr
 	VirtualRegHi{}	-> RcInteger
 	VirtualRegF{}	-> RcFloat
 	VirtualRegD{}	-> RcDouble
+	VirtualRegSSE{}	-> RcDoubleSSE
 
 
 -- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
diff --git a/compiler/nativeGen/RegAlloc/Graph/Main.hs b/compiler/nativeGen/RegAlloc/Graph/Main.hs
index 5fa771c..35ec879 100644
--- a/compiler/nativeGen/RegAlloc/Graph/Main.hs
+++ b/compiler/nativeGen/RegAlloc/Graph/Main.hs
@@ -380,25 +380,13 @@ seqNode node
 	`seq` (seqVirtualRegList (uniqSetToList (Color.nodeCoalesce node)))
 
 seqVirtualReg :: VirtualReg -> ()
-seqVirtualReg reg
- = case reg of
-	VirtualRegI _	-> ()
-	VirtualRegHi _	-> ()
-	VirtualRegF _	-> ()
-	VirtualRegD _	-> ()
+seqVirtualReg reg = reg `seq` ()
 
 seqRealReg :: RealReg -> ()
-seqRealReg reg
- = case reg of
- 	RealRegSingle _ -> ()
-	RealRegPair _ _	-> ()
+seqRealReg reg = reg `seq` ()
 
 seqRegClass :: RegClass -> ()
-seqRegClass c
- = case c of
- 	RcInteger	-> ()
-	RcFloat		-> ()
-	RcDouble	-> ()
+seqRegClass c = c `seq` ()
 
 seqMaybeRealReg :: Maybe RealReg -> ()
 seqMaybeRealReg mr
diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
index fd0faae..2f10178 100644
--- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
+++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
@@ -50,24 +50,27 @@ import FastTypes
 #define ALLOCATABLE_REGS_INTEGER (_ILIT(3))
 #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(6))
 #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(16))
 
 
 #elif x86_64_TARGET_ARCH
 #define ALLOCATABLE_REGS_INTEGER (_ILIT(5))
-#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(2))
+#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(0))
 #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
-
+#define ALLOCATABLE_REGS_SSE     (_ILIT(10))
 
 #elif powerpc_TARGET_ARCH
 #define ALLOCATABLE_REGS_INTEGER (_ILIT(16))
 #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(26))
 #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(0))
 
 
 #elif sparc_TARGET_ARCH
 #define ALLOCATABLE_REGS_INTEGER (_ILIT(14))
 #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(11))
 #define ALLOCATABLE_REGS_FLOAT   (_ILIT(22))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(0))
 
 
 #else
@@ -139,6 +142,17 @@ trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
 
 	= count3 <# ALLOCATABLE_REGS_DOUBLE
 
+trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
+	| count2	<- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_SSE
+				(virtualRegSqueeze RcDoubleSSE)
+				conflicts
+				
+	, count3	<- accSqueeze  count2    ALLOCATABLE_REGS_SSE
+				(realRegSqueeze   RcDoubleSSE)
+				exclusions
+
+	= count3 <# ALLOCATABLE_REGS_SSE
+
 
 -- Specification Code ----------------------------------------------------------
 --
diff --git a/compiler/nativeGen/RegClass.hs b/compiler/nativeGen/RegClass.hs
index 4bb300f..2a4ac33 100644
--- a/compiler/nativeGen/RegClass.hs
+++ b/compiler/nativeGen/RegClass.hs
@@ -17,6 +17,7 @@ data RegClass
 	= RcInteger 
 	| RcFloat
 	| RcDouble
+	| RcDoubleSSE -- x86 only: the SSE regs are a separate class
 	deriving Eq
 
 
@@ -24,8 +25,10 @@ instance Uniquable RegClass where
     getUnique RcInteger	= mkRegClassUnique 0
     getUnique RcFloat	= mkRegClassUnique 1
     getUnique RcDouble	= mkRegClassUnique 2
+    getUnique RcDoubleSSE = mkRegClassUnique 3
 
 instance Outputable RegClass where
     ppr RcInteger	= Outputable.text "I"
     ppr RcFloat		= Outputable.text "F"
     ppr RcDouble	= Outputable.text "D"
+    ppr RcDoubleSSE	= Outputable.text "S"
diff --git a/compiler/nativeGen/SPARC/Instr.hs b/compiler/nativeGen/SPARC/Instr.hs
index 87b6abc..00b57f9 100644
--- a/compiler/nativeGen/SPARC/Instr.hs
+++ b/compiler/nativeGen/SPARC/Instr.hs
@@ -373,6 +373,7 @@ sparc_mkSpillInstr reg _ slot
 			RcInteger -> II32
 			RcFloat   -> FF32
 			RcDouble  -> FF64
+			_         -> panic "sparc_mkSpillInstr"
 		
     in ST sz reg (fpRel (negate off_w))
 
@@ -391,6 +392,7 @@ sparc_mkLoadInstr reg _ slot
 			RcInteger -> II32
 			RcFloat   -> FF32
 			RcDouble  -> FF64
+			_         -> panic "sparc_mkLoadInstr"
 
         in LD sz (fpRel (- off_w)) reg
 
@@ -438,6 +440,7 @@ sparc_mkRegRegMoveInstr src dst
 		RcInteger -> ADD  False False src (RIReg g0) dst
 		RcDouble  -> FMOV FF64 src dst
 		RcFloat   -> FMOV FF32 src dst
+                _         -> panic "sparc_mkRegRegMoveInstr"
 	
 	| otherwise
 	= panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs
index d517a08..cb11d36 100644
--- a/compiler/nativeGen/SPARC/Ppr.hs
+++ b/compiler/nativeGen/SPARC/Ppr.hs
@@ -156,6 +156,7 @@ pprReg reg
 		VirtualRegHi u	-> text "%vHi_" <> asmSDoc (pprUnique u)
 		VirtualRegF  u	-> text "%vF_"  <> asmSDoc (pprUnique u)
 		VirtualRegD  u	-> text "%vD_"  <> asmSDoc (pprUnique u)
+                VirtualRegSSE u -> text "%vSSE_" <> asmSDoc (pprUnique u)
 
 	RegReal rr
 	 -> case rr of
diff --git a/compiler/nativeGen/SPARC/Regs.hs b/compiler/nativeGen/SPARC/Regs.hs
index 8ad400f..cd19138 100644
--- a/compiler/nativeGen/SPARC/Regs.hs
+++ b/compiler/nativeGen/SPARC/Regs.hs
@@ -95,22 +95,21 @@ virtualRegSqueeze cls vr
 	 -> case vr of
 	 	VirtualRegI{}		-> _ILIT(1)
 		VirtualRegHi{}		-> _ILIT(1)
-		VirtualRegF{}		-> _ILIT(0)
-		VirtualRegD{}		-> _ILIT(0)
+                _other                  -> _ILIT(0)
 
 	RcFloat
 	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
 		VirtualRegF{}		-> _ILIT(1)
 		VirtualRegD{}		-> _ILIT(2)
+                _other                  -> _ILIT(0)
 
 	RcDouble
 	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
 		VirtualRegF{}		-> _ILIT(1)
 		VirtualRegD{}		-> _ILIT(1)
+                _other                  -> _ILIT(0)
+
+        _other -> _ILIT(0)
 
 {-# INLINE realRegSqueeze #-}
 realRegSqueeze :: RegClass -> RealReg -> FastInt
@@ -141,6 +140,7 @@ realRegSqueeze cls rr
 			
 		RealRegPair{}		-> _ILIT(1)
 	 		 	 	
+        _other -> _ILIT(0)
  	
 -- | All the allocatable registers in the machine, 
 --	including register pairs.
@@ -283,7 +283,7 @@ regDotColor reg
  = case classOfRealReg reg of
  	RcInteger	-> text "blue"
 	RcFloat		-> text "red"
-	RcDouble	-> text "green"
+	_other          -> text "green"
 
 
 
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 5941a8c..e9bbc06 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -71,6 +71,22 @@ import Data.Bits
 import Data.Word
 import Data.Int
 
+sse2Enabled :: NatM Bool
+#if x86_64_TARGET_ARCH
+-- SSE2 is fixed on for x86_64.  It would be possible to make it optional,
+-- but we'd need to fix at least the foreign call code where the calling
+-- convention specifies the use of xmm regs, and possibly other places.
+sse2Enabled = return True
+#else
+sse2Enabled = do
+  dflags <- getDynFlagsNat
+  return (dopt Opt_SSE2 dflags)
+#endif
+
+if_sse2 :: NatM a -> NatM a -> NatM a
+if_sse2 sse2 x87 = do
+  b <- sse2Enabled
+  if b then sse2 else x87
 
 cmmTopCodeGen 
 	:: DynFlags
@@ -201,12 +217,15 @@ swizzleRegisterRep (Any _ codefn)     size = Any   size codefn
 
 
 -- | Grab the Reg for a CmmReg
-getRegisterReg :: CmmReg -> Reg
+getRegisterReg :: Bool -> CmmReg -> Reg
 
-getRegisterReg (CmmLocal (LocalReg u pk))
-  = RegVirtual $ mkVirtualReg u (cmmTypeSize pk)
+getRegisterReg use_sse2 (CmmLocal (LocalReg u pk))
+  = let sz = cmmTypeSize pk in
+    if isFloatSize sz && not use_sse2
+       then RegVirtual (mkVirtualReg u FF80)
+       else RegVirtual (mkVirtualReg u sz)
 
-getRegisterReg (CmmGlobal mid)
+getRegisterReg _ (CmmGlobal mid)
   = case get_GlobalReg_reg_or_addr mid of
        Left reg -> RegReal $ reg
        _other -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
@@ -405,8 +424,14 @@ getRegister (CmmReg (CmmGlobal PicBaseReg))
 #endif
 
 getRegister (CmmReg reg) 
-  = return (Fixed (cmmTypeSize (cmmRegType reg)) 
-		  (getRegisterReg reg) nilOL)
+  = do use_sse2 <- sse2Enabled
+       let
+         sz = cmmTypeSize (cmmRegType reg)
+         size | not use_sse2 && isFloatSize sz = FF80
+              | otherwise                      = sz
+       --
+       return (Fixed sz (getRegisterReg use_sse2 reg) nilOL)
+  
 
 getRegister tree@(CmmRegOff _ _) 
   = getRegister (mangleIndexTree tree)
@@ -437,78 +462,35 @@ getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do
 #endif
 
 
-
-
-#if i386_TARGET_ARCH
-
-getRegister (CmmLit (CmmFloat f W32)) = do
-    lbl <- getNewLabelNat
-    dflags <- getDynFlagsNat
-    dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
-    Amode addr addr_code <- getAmode dynRef
-    let code dst =
-	    LDATA ReadOnlyData
-			[CmmDataLabel lbl,
-			 CmmStaticLit (CmmFloat f W32)]
-	    `consOL` (addr_code `snocOL`
-	    GLD FF32 addr dst)
-    -- in
-    return (Any FF32 code)
-
-
-getRegister (CmmLit (CmmFloat d W64))
-  | d == 0.0
-  = let code dst = unitOL (GLDZ dst)
-    in  return (Any FF64 code)
-
-  | d == 1.0
-  = let code dst = unitOL (GLD1 dst)
-    in  return (Any FF64 code)
-
-  | otherwise = do
-    lbl <- getNewLabelNat
-    dflags <- getDynFlagsNat
-    dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
-    Amode addr addr_code <- getAmode dynRef
-    let code dst =
-	    LDATA ReadOnlyData
-			[CmmDataLabel lbl,
-			 CmmStaticLit (CmmFloat d W64)]
-	    `consOL` (addr_code `snocOL`
-	    GLD FF64 addr dst)
-    -- in
-    return (Any FF64 code)
-
-#endif /* i386_TARGET_ARCH */
-
-
-
-
-#if x86_64_TARGET_ARCH
-getRegister (CmmLit (CmmFloat 0.0 w)) = do
-   let size = floatSize w
-       code dst = unitOL  (XOR size (OpReg dst) (OpReg dst))
-	-- I don't know why there are xorpd, xorps, and pxor instructions.
-	-- They all appear to do the same thing --SDM
-   return (Any size code)
-
-getRegister (CmmLit (CmmFloat f w)) = do
-    lbl <- getNewLabelNat
-    let code dst = toOL [
-	    LDATA ReadOnlyData
-			[CmmDataLabel lbl,
-			 CmmStaticLit (CmmFloat f w)],
-	    MOV size (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
-	    ]
-    -- in
-    return (Any size code)
-  where size = floatSize w
-
-#endif /* x86_64_TARGET_ARCH */
-
-
-
-
+getRegister (CmmLit lit@(CmmFloat f w)) =
+  if_sse2 float_const_sse2 float_const_x87
+ where
+  float_const_sse2
+    | f == 0.0 = do
+      let
+          size = floatSize w
+          code dst = unitOL  (XOR size (OpReg dst) (OpReg dst))
+   	-- I don't know why there are xorpd, xorps, and pxor instructions.
+   	-- They all appear to do the same thing --SDM
+      return (Any size code)
+
+   | otherwise = do
+      Amode addr code <- memConstant (widthInBytes w) lit
+      loadFloatAmode True w addr code
+
+  float_const_x87 = case w of
+    W64
+      | f == 0.0 ->
+        let code dst = unitOL (GLDZ dst)
+        in  return (Any FF80 code)
+    
+      | f == 1.0 ->
+        let code dst = unitOL (GLD1 dst)
+        in  return (Any FF80 code)
+    
+    _otherwise -> do
+      Amode addr code <- memConstant (widthInBytes w) lit
+      loadFloatAmode False w addr code
 
 -- catch simple cases of zero- or sign-extended load
 getRegister (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
@@ -560,61 +542,20 @@ getRegister (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
     = return $ Any II64 (\dst -> unitOL $
         LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
 
-getRegister (CmmMachOp (MO_F_Neg W32) [x]) = do
-  x_code <- getAnyReg x
-  lbl <- getNewLabelNat
-  let
-    code dst = x_code dst `appOL` toOL [
-	-- This is how gcc does it, so it can't be that bad:
-	LDATA ReadOnlyData16 [
-		CmmAlign 16,
-		CmmDataLabel lbl,
-		CmmStaticLit (CmmInt 0x80000000 W32),
-		CmmStaticLit (CmmInt 0 W32),
-		CmmStaticLit (CmmInt 0 W32),
-		CmmStaticLit (CmmInt 0 W32)
-	],
-	XOR FF32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
-		-- xorps, so we need the 128-bit constant
-		-- ToDo: rip-relative
-	]
-  --
-  return (Any FF32 code)
-
-getRegister (CmmMachOp (MO_F_Neg W64) [x]) = do
-  x_code <- getAnyReg x
-  lbl <- getNewLabelNat
-  let
-	-- This is how gcc does it, so it can't be that bad:
-    code dst = x_code dst `appOL` toOL [
-	LDATA ReadOnlyData16 [
-		CmmAlign 16,
-		CmmDataLabel lbl,
-		CmmStaticLit (CmmInt 0x8000000000000000 W64),
-		CmmStaticLit (CmmInt 0 W64)
-	],
-		-- gcc puts an unpck here.  Wonder if we need it.
-	XOR FF64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
-		-- xorpd, so we need the 128-bit constant
-	]
-  --
-  return (Any FF64 code)
-
 #endif /* x86_64_TARGET_ARCH */
 
 
 
 
 
-getRegister (CmmMachOp mop [x]) -- unary MachOps
-  = case mop of
-#if i386_TARGET_ARCH
-      MO_F_Neg W32 -> trivialUFCode FF32 (GNEG FF32) x
-      MO_F_Neg W64 -> trivialUFCode FF64 (GNEG FF64) x
-#endif
+getRegister (CmmMachOp mop [x]) = do -- unary MachOps
+    sse2 <- sse2Enabled
+    case mop of
+      MO_F_Neg w
+         | sse2      -> sse2NegCode w x
+         | otherwise -> trivialUFCode FF80 (GNEG FF80) x
 
       MO_S_Neg w -> triv_ucode NEGI (intSize w)
-      MO_F_Neg w -> triv_ucode NEGI (floatSize w)
       MO_Not w   -> triv_ucode NOT  (intSize w)
 
       -- Nop conversions
@@ -659,13 +600,13 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
 	-- the form of a movzl and print it as a movl later.
 #endif
 
-#if i386_TARGET_ARCH
-      MO_FF_Conv W32 W64 -> conversionNop FF64 x
-      MO_FF_Conv W64 W32 -> conversionNop FF32 x
-#else
-      MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
-      MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
-#endif
+      MO_FF_Conv W32 W64
+        | sse2      -> coerceFP2FP W64 x
+        | otherwise -> conversionNop FF80 x 
+
+      MO_FF_Conv W64 W32
+        | sse2      -> coerceFP2FP W32 x
+        | otherwise -> conversionNop FF80 x 
 
       MO_FS_Conv from to -> coerceFP2Int from to x
       MO_SF_Conv from to -> coerceInt2FP from to x
@@ -707,8 +648,9 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
                  return (swizzleRegisterRep e_code new_size)
 
 
-getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
-  = case mop of
+getRegister e@(CmmMachOp mop [x, y]) = do -- dyadic MachOps
+  sse2 <- sse2Enabled
+  case mop of
       MO_F_Eq w -> condFltReg EQQ x y
       MO_F_Ne w -> condFltReg NE x y
       MO_F_Gt w -> condFltReg GTT x y
@@ -729,19 +671,14 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
       MO_U_Lt rep -> condIntReg LU  x y
       MO_U_Le rep -> condIntReg LEU x y
 
-#if i386_TARGET_ARCH
-      MO_F_Add w -> trivialFCode w GADD x y
-      MO_F_Sub w -> trivialFCode w GSUB x y
-      MO_F_Quot w -> trivialFCode w GDIV x y
-      MO_F_Mul w -> trivialFCode w GMUL x y
-#endif
-
-#if x86_64_TARGET_ARCH
-      MO_F_Add w -> trivialFCode w ADD x y
-      MO_F_Sub w -> trivialFCode w SUB x y
-      MO_F_Quot w -> trivialFCode w FDIV x y
-      MO_F_Mul w -> trivialFCode w MUL x y
-#endif
+      MO_F_Add w  | sse2      -> trivialFCode_sse2 w ADD  x y
+                  | otherwise -> trivialFCode_x87  w GADD x y
+      MO_F_Sub w  | sse2      -> trivialFCode_sse2 w SUB  x y
+                  | otherwise -> trivialFCode_x87  w GSUB x y
+      MO_F_Quot w | sse2      -> trivialFCode_sse2 w FDIV x y
+                  | otherwise -> trivialFCode_x87  w GDIV x y
+      MO_F_Mul w  | sse2      -> trivialFCode_sse2 w MUL x y
+                  | otherwise -> trivialFCode_x87  w GMUL x y
 
       MO_Add rep -> add_code rep x y
       MO_Sub rep -> sub_code rep x y
@@ -892,13 +829,9 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
 getRegister (CmmLoad mem pk)
   | isFloatType pk
   = do
-    Amode src mem_code <- getAmode mem
-    let
-	size = cmmTypeSize pk
-    	code dst = mem_code `snocOL` 
-		   IF_ARCH_i386(GLD size src dst,
-			        MOV size (OpAddr src) (OpReg dst))
-    return (Any size code)
+    Amode addr mem_code <- getAmode mem
+    use_sse2 <- sse2Enabled
+    loadFloatAmode use_sse2 (typeWidth pk) addr mem_code
 
 #if i386_TARGET_ARCH
 getRegister (CmmLoad mem pk)
@@ -1032,11 +965,8 @@ getNonClobberedReg expr = do
 
 reg2reg :: Size -> Reg -> Reg -> Instr
 reg2reg size src dst 
-#if i386_TARGET_ARCH
-  | isFloatSize size = GMOV src dst
-#endif
-  | otherwise	     = MOV size (OpReg src) (OpReg dst)
-
+  | size == FF80 = GMOV src dst
+  | otherwise	 = MOV size (OpReg src) (OpReg dst)
 
 
 --------------------------------------------------------------------------------
@@ -1122,30 +1052,41 @@ x86_complex_amode base index shift offset
 -- (see trivialCode where this function is used for an example).
 
 getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
-#if x86_64_TARGET_ARCH
-getNonClobberedOperand (CmmLit lit)
-  | isSuitableFloatingPointLit lit = do
-    lbl <- getNewLabelNat
-    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
-					   CmmStaticLit lit])
-    return (OpAddr (ripRel (ImmCLbl lbl)), code)
-#endif
-getNonClobberedOperand (CmmLit lit)
-  | is32BitLit lit && not (isFloatType (cmmLitType lit)) =
-    return (OpImm (litToImm lit), nilOL)
-getNonClobberedOperand (CmmLoad mem pk) 
-  | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do
-    Amode src mem_code <- getAmode mem
-    (src',save_code) <- 
-	if (amodeCouldBeClobbered src) 
-		then do
-		   tmp <- getNewRegNat archWordSize
-		   return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
-			   unitOL (LEA II32 (OpAddr src) (OpReg tmp)))
-		else
-		   return (src, nilOL)
-    return (OpAddr src', save_code `appOL` mem_code)
-getNonClobberedOperand e = do
+getNonClobberedOperand (CmmLit lit) = do
+  use_sse2 <- sse2Enabled
+  if use_sse2 && isSuitableFloatingPointLit lit
+    then do
+      let CmmFloat _ w = lit
+      Amode addr code <- memConstant (widthInBytes w) lit
+      return (OpAddr addr, code)
+     else do
+
+  if is32BitLit lit && not (isFloatType (cmmLitType lit))
+    then return (OpImm (litToImm lit), nilOL)
+    else getNonClobberedOperand_generic (CmmLit lit)
+
+getNonClobberedOperand (CmmLoad mem pk) = do
+  use_sse2 <- sse2Enabled
+  if (not (isFloatType pk) || use_sse2)
+      && IF_ARCH_i386(not (isWord64 pk), True)
+    then do
+      Amode src mem_code <- getAmode mem
+      (src',save_code) <- 
+  	if (amodeCouldBeClobbered src) 
+  		then do
+  		   tmp <- getNewRegNat archWordSize
+  		   return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
+  			   unitOL (LEA II32 (OpAddr src) (OpReg tmp)))
+  		else
+  		   return (src, nilOL)
+      return (OpAddr src', save_code `appOL` mem_code)
+    else do
+      getNonClobberedOperand_generic (CmmLoad mem pk)
+
+getNonClobberedOperand e = getNonClobberedOperand_generic e
+
+getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand_generic e = do
     (reg, code) <- getNonClobberedReg e
     return (OpReg reg, code)
 
@@ -1158,22 +1099,32 @@ regClobbered _ = False
 -- getOperand: the operand is not required to remain valid across the
 -- computation of an arbitrary expression.
 getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
-#if x86_64_TARGET_ARCH
-getOperand (CmmLit lit)
-  | isSuitableFloatingPointLit lit = do
-    lbl <- getNewLabelNat
-    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
-					   CmmStaticLit lit])
-    return (OpAddr (ripRel (ImmCLbl lbl)), code)
-#endif
-getOperand (CmmLit lit)
-  | is32BitLit lit && not (isFloatType (cmmLitType lit)) = do
-    return (OpImm (litToImm lit), nilOL)
-getOperand (CmmLoad mem pk)
-  | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do
-    Amode src mem_code <- getAmode mem
-    return (OpAddr src, mem_code)
-getOperand e = do
+
+getOperand (CmmLit lit) = do
+  use_sse2 <- sse2Enabled
+  if (use_sse2 && isSuitableFloatingPointLit lit)
+    then do
+      let CmmFloat _ w = lit
+      Amode addr code <- memConstant (widthInBytes w) lit
+      return (OpAddr addr, code)
+    else do
+
+  if is32BitLit lit && not (isFloatType (cmmLitType lit))
+    then return (OpImm (litToImm lit), nilOL)
+    else getOperand_generic (CmmLit lit)
+
+getOperand (CmmLoad mem pk) = do
+  use_sse2 <- sse2Enabled
+  if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True)
+     then do
+       Amode src mem_code <- getAmode mem
+       return (OpAddr src, mem_code)
+     else
+       getOperand_generic (CmmLoad mem pk)
+
+getOperand e = getOperand_generic e
+
+getOperand_generic e = do
     (reg, code) <- getSomeReg e
     return (OpReg reg, code)
 
@@ -1183,6 +1134,38 @@ isOperand (CmmLit lit)  = is32BitLit lit
 			  || isSuitableFloatingPointLit lit
 isOperand _             = False
 
+memConstant :: Int -> CmmLit -> NatM Amode
+memConstant align lit = do
+#ifdef x86_64_TARGET_ARCH
+  lbl <- getNewLabelNat
+  let addr = ripRel (ImmCLbl lbl)
+      addr_code = nilOL
+#else
+  lbl <- getNewLabelNat
+  dflags <- getDynFlagsNat
+  dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
+  Amode addr addr_code <- getAmode dynRef
+#endif
+  let code =
+        LDATA ReadOnlyData
+  		[CmmAlign align,
+                 CmmDataLabel lbl,
+  		 CmmStaticLit lit]
+        `consOL` addr_code
+  return (Amode addr code)
+
+
+loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
+loadFloatAmode use_sse2 w addr addr_code = do
+  let size = floatSize w
+      code dst = addr_code `snocOL`
+                 if use_sse2
+                    then MOV size (OpAddr addr) (OpReg dst)
+                    else GLD size addr dst
+  -- in
+  return (Any (if use_sse2 then size else FF80) code)
+
+
 -- if we want a floating-point literal as an operand, we can
 -- use it directly from memory.  However, if the literal is
 -- zero, we're better off generating it into a register using
@@ -1191,10 +1174,15 @@ isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
 isSuitableFloatingPointLit _ = False
 
 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
-getRegOrMem (CmmLoad mem pk)
-  | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do
-    Amode src mem_code <- getAmode mem
-    return (OpAddr src, mem_code)
+getRegOrMem e@(CmmLoad mem pk) = do
+  use_sse2 <- sse2Enabled
+  if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True)
+     then do
+       Amode src mem_code <- getAmode mem
+       return (OpAddr src, mem_code)
+     else do
+       (reg, code) <- getNonClobberedReg e
+       return (OpReg reg, code)
 getRegOrMem e = do
     (reg, code) <- getNonClobberedReg e
     return (OpReg reg, code)
@@ -1314,40 +1302,36 @@ condIntCode cond x y = do
 --------------------------------------------------------------------------------
 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 
-#if i386_TARGET_ARCH
 condFltCode cond x y 
-  = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
-  (x_reg, x_code) <- getNonClobberedReg x
-  (y_reg, y_code) <- getSomeReg y
-  let
-	code = x_code `appOL` y_code `snocOL`
-		GCMP cond x_reg y_reg
-  -- The GCMP insn does the test and sets the zero flag if comparable
-  -- and true.  Hence we always supply EQQ as the condition to test.
-  return (CondCode True EQQ code)
-
-#elif x86_64_TARGET_ARCH
--- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
--- an operand, but the right must be a reg.  We can probably do better
--- than this general case...
-condFltCode cond x y = do
-  (x_reg, x_code) <- getNonClobberedReg x
-  (y_op, y_code) <- getOperand y
-  let
-	code = x_code `appOL`
-	       y_code `snocOL`
-    	    	  CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg)
-	-- NB(1): we need to use the unsigned comparison operators on the
-	-- result of this comparison.
-  -- in
-  return (CondCode True (condToUnsigned cond) code)
-
-#else
-condFltCode 	= panic "X86.condFltCode: not defined"
-
-#endif
-
+  = if_sse2 condFltCode_sse2 condFltCode_x87
+  where
 
+  condFltCode_x87
+    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
+    (x_reg, x_code) <- getNonClobberedReg x
+    (y_reg, y_code) <- getSomeReg y
+    use_sse2 <- sse2Enabled
+    let
+  	code = x_code `appOL` y_code `snocOL`
+  		GCMP cond x_reg y_reg
+    -- The GCMP insn does the test and sets the zero flag if comparable
+    -- and true.  Hence we always supply EQQ as the condition to test.
+    return (CondCode True EQQ code)
+  
+  -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+  -- an operand, but the right must be a reg.  We can probably do better
+  -- than this general case...
+  condFltCode_sse2 = do
+    (x_reg, x_code) <- getNonClobberedReg x
+    (y_op, y_code) <- getOperand y
+    let
+  	code = x_code `appOL`
+  	       y_code `snocOL`
+      	    	  CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg)
+  	-- NB(1): we need to use the unsigned comparison operators on the
+  	-- result of this comparison.
+    -- in
+    return (CondCode True (condToUnsigned cond) code)
 
 -- -----------------------------------------------------------------------------
 -- Generating assignments
@@ -1413,29 +1397,31 @@ assignMem_IntCode pk addr src = do
 -- Assign; dst is a reg, rhs is mem
 assignReg_IntCode pk reg (CmmLoad src _) = do
   load_code <- intLoadCode (MOV pk) src
-  return (load_code (getRegisterReg reg))
+  return (load_code (getRegisterReg False{-no sse2-} reg))
 
 -- dst is a reg, but src could be anything
 assignReg_IntCode pk reg src = do
   code <- getAnyReg src
-  return (code (getRegisterReg reg))
+  return (code (getRegisterReg False{-no sse2-} reg))
 
 
 -- Floating point assignment to memory
 assignMem_FltCode pk addr src = do
   (src_reg, src_code) <- getNonClobberedReg src
   Amode addr addr_code <- getAmode addr
+  use_sse2 <- sse2Enabled
   let
 	code = src_code `appOL`
 	       addr_code `snocOL`
-                IF_ARCH_i386(GST pk src_reg addr,
-		             MOV pk (OpReg src_reg) (OpAddr addr))
+                if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
+                            else GST pk src_reg addr
   return code
 
 -- Floating point assignment to a register/temporary
 assignReg_FltCode pk reg src = do
+  use_sse2 <- sse2Enabled
   src_code <- getAnyReg src
-  return (src_code (getRegisterReg reg))
+  return (src_code (getRegisterReg use_sse2 reg))
 
 
 genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
@@ -1477,15 +1463,10 @@ genCondJump
     -> CmmExpr      -- the condition on which to branch
     -> NatM InstrBlock
 
-#if i386_TARGET_ARCH
-genCondJump id bool = do
-  CondCode _ cond code <- getCondCode bool
-  return (code `snocOL` JXX cond id)
-
-#elif x86_64_TARGET_ARCH
 genCondJump id bool = do
   CondCode is_float cond cond_code <- getCondCode bool
-  if not is_float
+  use_sse2 <- sse2Enabled
+  if not is_float || not use_sse2
     then
 	return (cond_code `snocOL` JXX cond id)
     else do
@@ -1513,13 +1494,6 @@ genCondJump id bool = do
 		]
 	return (cond_code `appOL` code)
 
-#else
-genCondJump	= panic "X86.genCondJump: not defined"
-
-#endif
-
-
-
 
 -- -----------------------------------------------------------------------------
 --  Generating C calls
@@ -1549,7 +1523,11 @@ genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL
 genCCall (CmmPrim op) [CmmHinted r _] args = do
   l1 <- getNewLabelNat
   l2 <- getNewLabelNat
-  case op of
+  sse2 <- sse2Enabled
+  if sse2
+    then
+      outOfLineFloatOp op r args
+    else case op of
 	MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
 	MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
 	
@@ -1563,11 +1541,12 @@ genCCall (CmmPrim op) [CmmHinted r _] args = do
 	MO_F64_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
 	
 	other_op    -> outOfLineFloatOp op r args
+
  where
   actuallyInlineFloatOp instr size [CmmHinted x _]
 	= do res <- trivialUFCode size (instr size) x
 	     any <- anyReg res
- 	     return (any (getRegisterReg (CmmLocal r)))
+ 	     return (any (getRegisterReg False (CmmLocal r)))
 
 genCCall target dest_regs args = do
     let
@@ -1582,7 +1561,8 @@ genCCall target dest_regs args = do
     setDeltaNat (delta0 - arg_pad_size)
 #endif
 
-    push_codes <- mapM push_arg (reverse args)
+    use_sse2 <- sse2Enabled
+    push_codes <- mapM (push_arg use_sse2) (reverse args)
     delta <- getDeltaNat
 
     -- in
@@ -1624,15 +1604,26 @@ genCCall target dest_regs args = do
 	-- assign the results, if necessary
 	assign_code []     = nilOL
 	assign_code [CmmHinted dest _hint]
-	  | isFloatType ty = unitOL (GMOV fake0 r_dest)
+	  | isFloatType ty = 
+             if use_sse2
+                then let tmp_amode = AddrBaseIndex (EABaseReg esp)
+                                                   EAIndexNone
+                                                   (ImmInt 0)
+                         sz = floatSize w
+                     in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
+                               GST sz fake0 tmp_amode,
+                               MOV sz (OpAddr tmp_amode) (OpReg r_dest),
+                               ADD II32 (OpImm (ImmInt b)) (OpReg esp)]
+                else unitOL (GMOV fake0 r_dest)
 	  | isWord64 ty    = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
 				    MOV II32 (OpReg edx) (OpReg r_dest_hi)]
 	  | otherwise      = unitOL (MOV (intSize w) (OpReg eax) (OpReg r_dest))
 	  where 
 		ty = localRegType dest
 		w  = typeWidth ty
+                b  = widthInBytes w
 		r_dest_hi = getHiVRegFromLo r_dest
-		r_dest    = getRegisterReg (CmmLocal dest)
+		r_dest    = getRegisterReg use_sse2 (CmmLocal dest)
 	assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
 
     return (push_code `appOL` 
@@ -1647,10 +1638,10 @@ genCCall target dest_regs args = do
                 | otherwise = x + a - (x `mod` a)
 
 
-    push_arg :: HintedCmmActual {-current argument-}
+    push_arg :: Bool -> HintedCmmActual {-current argument-}
                     -> NatM InstrBlock  -- code
 
-    push_arg (CmmHinted arg _hint) -- we don't need the hints on x86
+    push_arg use_sse2 (CmmHinted arg _hint) -- we don't need the hints on x86
       | isWord64 arg_ty = do
         ChildCode64 code r_lo <- iselExpr64 arg
         delta <- getDeltaNat
@@ -1673,10 +1664,15 @@ genCCall target dest_regs args = do
            then return (code `appOL`
                         toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
                               DELTA (delta-size),
-                              GST (floatSize (typeWidth arg_ty))
-				  reg (AddrBaseIndex (EABaseReg esp) 
+                              let addr = AddrBaseIndex (EABaseReg esp) 
                                                         EAIndexNone
-                                                        (ImmInt 0))]
+                                                        (ImmInt 0)
+                                  size = floatSize (typeWidth arg_ty)
+                              in
+                              if use_sse2 
+                                 then MOV size (OpReg reg) (OpAddr addr)
+                                 else GST size reg addr
+                             ]
                        )
            else return (code `snocOL`
                         PUSH II32 (OpReg reg) `snocOL`
@@ -1753,13 +1749,13 @@ genCCall target dest_regs args = do
 		 return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
 
     let
-	-- The x86_64 ABI requires us to set %al to the number of SSE
+	-- The x86_64 ABI requires us to set %al to the number of SSE2
 	-- registers that contain arguments, if the called routine
 	-- is a varargs function.  We don't know whether it's a
 	-- varargs function or not, so we have to assume it is.
 	--
 	-- It's not safe to omit this assignment, even if the number
-	-- of SSE regs in use is zero.  If %al is larger than 8
+	-- of SSE2 regs in use is zero.  If %al is larger than 8
 	-- on entry to a varargs function, seg faults ensue.
 	assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
 
@@ -1785,7 +1781,7 @@ genCCall target dest_regs args = do
 		_ -> unitOL (MOV (cmmTypeSize rep) (OpReg rax) (OpReg r_dest))
 	  where 
 		rep = localRegType dest
-		r_dest = getRegisterReg (CmmLocal dest)
+		r_dest = getRegisterReg True (CmmLocal dest)
 	assign_code many = panic "genCCall.assign_code many"
 
     return (load_args_code 	`appOL` 
@@ -1870,17 +1866,7 @@ outOfLineFloatOp mop res args
       targetExpr <- cmmMakeDynamicReference dflags addImportNat CallReference lbl
       let target = CmmCallee targetExpr CCallConv
      
-      if isFloat64 (localRegType res)
-        then
-          stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn)
-        else do
-          uq <- getUniqueNat
-          let 
-            tmp = LocalReg uq f64
-          -- in
-          code1 <- stmtToInstrs (CmmCall target [CmmHinted tmp NoHint] args CmmUnsafe CmmMayReturn)
-          code2 <- stmtToInstrs (CmmAssign (CmmLocal res) (CmmReg (CmmLocal tmp)))
-          return (code1 `appOL` code2)
+      stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn)
   where
 	-- Assume we can call these functions directly, and that they're not in a dynamic library.
 	-- TODO: Why is this ok? Under linux this code will be in libm.so
@@ -2027,72 +2013,64 @@ condIntReg cond x y = do
 
 
 condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
-
-#if i386_TARGET_ARCH
-condFltReg cond x y = do
-  CondCode _ cond cond_code <- condFltCode cond x y
-  tmp <- getNewRegNat II8
-  let 
-	code dst = cond_code `appOL` toOL [
-		    SETCC cond (OpReg tmp),
-		    MOVZxL II8 (OpReg tmp) (OpReg dst)
-		  ]
-  -- in
-  return (Any II32 code)
-
-#elif x86_64_TARGET_ARCH
-condFltReg cond x y = do
-  CondCode _ cond cond_code <- condFltCode cond x y
-  tmp1 <- getNewRegNat archWordSize
-  tmp2 <- getNewRegNat archWordSize
-  let 
-	-- We have to worry about unordered operands (eg. comparisons
-	-- against NaN).  If the operands are unordered, the comparison
-	-- sets the parity flag, carry flag and zero flag.
-	-- All comparisons are supposed to return false for unordered
-	-- operands except for !=, which returns true.
-	--
-	-- Optimisation: we don't have to test the parity flag if we
-	-- know the test has already excluded the unordered case: eg >
-	-- and >= test for a zero carry flag, which can only occur for
-	-- ordered operands.
-	--
-	-- ToDo: by reversing comparisons we could avoid testing the
-	-- parity flag in more cases.
-
-	code dst = 
-	   cond_code `appOL` 
-	     (case cond of
-	  	NE  -> or_unordered dst
-		GU  -> plain_test   dst
-		GEU -> plain_test   dst
-		_   -> and_ordered  dst)
-
-	plain_test dst = toOL [
-		    SETCC cond (OpReg tmp1),
-		    MOVZxL II8 (OpReg tmp1) (OpReg dst)
-		 ]
-	or_unordered dst = toOL [
-		    SETCC cond (OpReg tmp1),
-		    SETCC PARITY (OpReg tmp2),
-		    OR II8 (OpReg tmp1) (OpReg tmp2),
-		    MOVZxL II8 (OpReg tmp2) (OpReg dst)
-		  ]
-	and_ordered dst = toOL [
-		    SETCC cond (OpReg tmp1),
-		    SETCC NOTPARITY (OpReg tmp2),
-		    AND II8 (OpReg tmp1) (OpReg tmp2),
-		    MOVZxL II8 (OpReg tmp2) (OpReg dst)
-		  ]
-  -- in
-  return (Any II32 code)
-
-#else
-condFltReg	= panic "X86.condFltReg: not defined"
-
-#endif
-
-
+condFltReg cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
+ where
+  condFltReg_x87 = do
+    CondCode _ cond cond_code <- condFltCode cond x y
+    tmp <- getNewRegNat II8
+    let 
+  	code dst = cond_code `appOL` toOL [
+  		    SETCC cond (OpReg tmp),
+  		    MOVZxL II8 (OpReg tmp) (OpReg dst)
+  		  ]
+    -- in
+    return (Any II32 code)
+  
+  condFltReg_sse2 = do
+    CondCode _ cond cond_code <- condFltCode cond x y
+    tmp1 <- getNewRegNat archWordSize
+    tmp2 <- getNewRegNat archWordSize
+    let 
+  	-- We have to worry about unordered operands (eg. comparisons
+  	-- against NaN).  If the operands are unordered, the comparison
+  	-- sets the parity flag, carry flag and zero flag.
+  	-- All comparisons are supposed to return false for unordered
+  	-- operands except for !=, which returns true.
+  	--
+  	-- Optimisation: we don't have to test the parity flag if we
+  	-- know the test has already excluded the unordered case: eg >
+  	-- and >= test for a zero carry flag, which can only occur for
+  	-- ordered operands.
+  	--
+  	-- ToDo: by reversing comparisons we could avoid testing the
+  	-- parity flag in more cases.
+  
+  	code dst = 
+  	   cond_code `appOL` 
+  	     (case cond of
+  	  	NE  -> or_unordered dst
+  		GU  -> plain_test   dst
+  		GEU -> plain_test   dst
+  		_   -> and_ordered  dst)
+  
+  	plain_test dst = toOL [
+  		    SETCC cond (OpReg tmp1),
+  		    MOVZxL II8 (OpReg tmp1) (OpReg dst)
+  		 ]
+  	or_unordered dst = toOL [
+  		    SETCC cond (OpReg tmp1),
+  		    SETCC PARITY (OpReg tmp2),
+  		    OR II8 (OpReg tmp1) (OpReg tmp2),
+  		    MOVZxL II8 (OpReg tmp2) (OpReg dst)
+  		  ]
+  	and_ordered dst = toOL [
+  		    SETCC cond (OpReg tmp1),
+  		    SETCC NOTPARITY (OpReg tmp2),
+  		    AND II8 (OpReg tmp1) (OpReg tmp2),
+  		    MOVZxL II8 (OpReg tmp2) (OpReg dst)
+  		  ]
+    -- in
+    return (Any II32 code)
 
 
 -- -----------------------------------------------------------------------------
@@ -2207,26 +2185,21 @@ trivialUCode rep instr x = do
 
 -----------
 
-#if i386_TARGET_ARCH
-
-trivialFCode width instr x y = do
+trivialFCode_x87 width instr x y = do
   (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
   (y_reg, y_code) <- getSomeReg y
   let
-     size = floatSize width
+     size = FF80 -- always, on x87
      code dst =
 	x_code `appOL`
 	y_code `snocOL`
 	instr size x_reg y_reg dst
   return (Any size code)
 
-#endif
+trivialFCode_sse2 pk instr x y
+    = genTrivialCode size (instr size) x y
+    where size = floatSize pk
 
-#if x86_64_TARGET_ARCH
-trivialFCode pk instr x y 
-  = genTrivialCode size (instr size) x y
-  where size = floatSize pk
-#endif
 
 trivialUFCode size instr x = do
   (x_reg, x_code) <- getSomeReg x
@@ -2240,67 +2213,50 @@ trivialUFCode size instr x = do
 
 --------------------------------------------------------------------------------
 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
-
-#if i386_TARGET_ARCH
-coerceInt2FP from to x = do
-  (x_reg, x_code) <- getSomeReg x
-  let
-        opc  = case to of W32 -> GITOF; W64 -> GITOD
-        code dst = x_code `snocOL` opc x_reg dst
-	-- ToDo: works for non-II32 reps?
-  return (Any (floatSize to) code)
-
-#elif x86_64_TARGET_ARCH
-coerceInt2FP from to x = do
-  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
-  let
-        opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
-        code dst = x_code `snocOL` opc x_op dst
-  -- in
-  return (Any (floatSize to) code) -- works even if the destination rep is <II32
-
-#else
-coerceInt2FP	= panic "X86.coerceInt2FP: not defined"
-
-#endif
-
-
-
+coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
+ where
+   coerce_x87 = do
+     (x_reg, x_code) <- getSomeReg x
+     let
+           opc  = case to of W32 -> GITOF; W64 -> GITOD
+           code dst = x_code `snocOL` opc x_reg dst
+   	-- ToDo: works for non-II32 reps?
+     return (Any FF80 code)
+   
+   coerce_sse2 = do
+     (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+     let
+           opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
+           code dst = x_code `snocOL` opc (intSize from) x_op dst
+     -- in
+     return (Any (floatSize to) code)
+        -- works even if the destination rep is <II32
 
 --------------------------------------------------------------------------------
 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
-
-#if i386_TARGET_ARCH
-coerceFP2Int from to x = do
-  (x_reg, x_code) <- getSomeReg x
-  let
-        opc  = case from of W32 -> GFTOI; W64 -> GDTOI
-        code dst = x_code `snocOL` opc x_reg dst
-	-- ToDo: works for non-II32 reps?
-  -- in
-  return (Any (intSize to) code)
-
-#elif x86_64_TARGET_ARCH
-coerceFP2Int from to x = do
-  (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
-  let
-        opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ
-        code dst = x_code `snocOL` opc x_op dst
-  -- in
-  return (Any (intSize to) code) -- works even if the destination rep is <II32
-
-#else
-coerceFP2Int	= panic "X86.coerceFP2Int: not defined"
-
-#endif
-
-
+coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
+ where
+   coerceFP2Int_x87 = do
+     (x_reg, x_code) <- getSomeReg x
+     let
+           opc  = case from of W32 -> GFTOI; W64 -> GDTOI
+           code dst = x_code `snocOL` opc x_reg dst
+   	-- ToDo: works for non-II32 reps?
+     -- in
+     return (Any (intSize to) code)
+   
+   coerceFP2Int_sse2 = do
+     (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+     let
+           opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ
+           code dst = x_code `snocOL` opc (intSize to) x_op dst
+     -- in
+     return (Any (intSize to) code)
+         -- works even if the destination rep is <II32
 
 
 --------------------------------------------------------------------------------
 coerceFP2FP :: Width -> CmmExpr -> NatM Register
-
-#if x86_64_TARGET_ARCH
 coerceFP2FP to x = do
   (x_reg, x_code) <- getSomeReg x
   let
@@ -2309,10 +2265,22 @@ coerceFP2FP to x = do
   -- in
   return (Any (floatSize to) code)
 
-#else
-coerceFP2FP	= panic "X86.coerceFP2FP: not defined"
-
-#endif
-
-
+--------------------------------------------------------------------------------
 
+sse2NegCode :: Width -> CmmExpr -> NatM Register
+sse2NegCode w x = do
+  let sz = floatSize w
+  x_code <- getAnyReg x
+  -- This is how gcc does it, so it can't be that bad:
+  let
+    const | FF32 <- sz = CmmInt 0x80000000 W32
+          | otherwise  = CmmInt 0x8000000000000000 W64
+  Amode amode amode_code <- memConstant (widthInBytes w) const
+  tmp <- getNewRegNat sz
+  let
+    code dst = x_code dst `appOL` amode_code `appOL` toOL [
+        MOV sz (OpAddr amode) (OpReg tmp),
+	XOR sz (OpReg tmp) (OpReg dst)
+	]
+  --
+  return (Any sz code)
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 6dc229b..f856313 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -253,10 +253,10 @@ data Instr
 	-- use MOV for moving (either movss or movsd (movlpd better?))
 	| CVTSS2SD	Reg Reg		-- F32 to F64
 	| CVTSD2SS	Reg Reg		-- F64 to F32
-	| CVTTSS2SIQ	Operand Reg	-- F32 to I32/I64 (with truncation)
-	| CVTTSD2SIQ	Operand	Reg	-- F64 to I32/I64 (with truncation)
-	| CVTSI2SS	Operand Reg	-- I32/I64 to F32
-	| CVTSI2SD	Operand Reg	-- I32/I64 to F64
+	| CVTTSS2SIQ	Size Operand Reg -- F32 to I32/I64 (with truncation)
+	| CVTTSD2SIQ	Size Operand Reg -- F64 to I32/I64 (with truncation)
+	| CVTSI2SS	Size Operand Reg -- I32/I64 to F32
+	| CVTSI2SD	Size Operand Reg -- I32/I64 to F64
 
 	-- use ADD & SUB for arithmetic.  In both cases, operands
 	-- are  Operand Reg.
@@ -353,7 +353,6 @@ x86_regUsageOfInstr instr
     CLTD   _		-> mkRU [eax] [edx]
     NOP			-> mkRU [] []
 
-#if i386_TARGET_ARCH
     GMOV   src dst	-> mkRU [src] [dst]
     GLD    _ src dst	-> mkRU (use_EA src) [dst]
     GST    _ src dst	-> mkRUR (src : use_EA dst)
@@ -379,17 +378,14 @@ x86_regUsageOfInstr instr
     GSIN   _ _ _ src dst -> mkRU [src] [dst]
     GCOS   _ _ _ src dst -> mkRU [src] [dst]
     GTAN   _ _ _ src dst -> mkRU [src] [dst]
-#endif
 
-#if x86_64_TARGET_ARCH
     CVTSS2SD   src dst	-> mkRU [src] [dst]
     CVTSD2SS   src dst	-> mkRU [src] [dst]
-    CVTTSS2SIQ src dst	-> mkRU (use_R src) [dst]
-    CVTTSD2SIQ src dst	-> mkRU (use_R src) [dst]
-    CVTSI2SS   src dst	-> mkRU (use_R src) [dst]
-    CVTSI2SD   src dst	-> mkRU (use_R src) [dst]
+    CVTTSS2SIQ _ src dst -> mkRU (use_R src) [dst]
+    CVTTSD2SIQ _ src dst -> mkRU (use_R src) [dst]
+    CVTSI2SS   _ src dst -> mkRU (use_R src) [dst]
+    CVTSI2SD   _ src dst -> mkRU (use_R src) [dst]
     FDIV _     src dst	-> usageRM src dst
-#endif    
 
     FETCHGOT reg        -> mkRU [] [reg]
     FETCHPC  reg        -> mkRU [] [reg]
@@ -483,7 +479,6 @@ x86_patchRegsOfInstr instr env
     JMP op		-> patch1 JMP op
     JMP_TBL op ids      -> patch1 JMP_TBL op $ ids
 
-#if i386_TARGET_ARCH
     GMOV src dst	-> GMOV (env src) (env dst)
     GLD  sz src dst	-> GLD sz (lookupAddr src) (env dst)
     GST  sz src dst	-> GST sz (env src) (lookupAddr dst)
@@ -509,17 +504,14 @@ x86_patchRegsOfInstr instr env
     GSIN sz l1 l2 src dst	-> GSIN sz l1 l2 (env src) (env dst)
     GCOS sz l1 l2 src dst	-> GCOS sz l1 l2 (env src) (env dst)
     GTAN sz l1 l2 src dst	-> GTAN sz l1 l2 (env src) (env dst)
-#endif
 
-#if x86_64_TARGET_ARCH
     CVTSS2SD src dst	-> CVTSS2SD (env src) (env dst)
     CVTSD2SS src dst	-> CVTSD2SS (env src) (env dst)
-    CVTTSS2SIQ src dst	-> CVTTSS2SIQ (patchOp src) (env dst)
-    CVTTSD2SIQ src dst	-> CVTTSD2SIQ (patchOp src) (env dst)
-    CVTSI2SS src dst	-> CVTSI2SS (patchOp src) (env dst)
-    CVTSI2SD src dst	-> CVTSI2SD (patchOp src) (env dst)
+    CVTTSS2SIQ sz src dst -> CVTTSS2SIQ sz (patchOp src) (env dst)
+    CVTTSD2SIQ sz src dst -> CVTTSD2SIQ sz (patchOp src) (env dst)
+    CVTSI2SS sz src dst	-> CVTSI2SS sz (patchOp src) (env dst)
+    CVTSI2SD sz src dst	-> CVTSI2SD sz (patchOp src) (env dst)
     FDIV sz src dst	-> FDIV sz (patchOp src) (patchOp dst)
-#endif    
 
     CALL (Left _)  _	-> instr
     CALL (Right reg) p	-> CALL (Right (env reg)) p
@@ -602,30 +594,16 @@ x86_mkSpillInstr
 	-> Int		-- spill slot to use
 	-> Instr
 
-#if   i386_TARGET_ARCH
-x86_mkSpillInstr reg delta slot
-  = let	off     = spillSlotToOffset slot
-    in
-    let off_w = (off-delta) `div` 4
-    in case targetClassOfReg reg of
-	   RcInteger -> MOV II32 (OpReg reg) (OpAddr (spRel off_w))
-	   _         -> GST FF80 reg (spRel off_w) {- RcFloat/RcDouble -}
-
-#elif x86_64_TARGET_ARCH
 x86_mkSpillInstr reg delta slot
   = let	off     = spillSlotToOffset slot
     in
-    let off_w = (off-delta) `div` 8
+    let off_w = (off-delta) `div` IF_ARCH_i386(4,8)
     in case targetClassOfReg reg of
-	   RcInteger -> MOV II64 (OpReg reg) (OpAddr (spRel off_w))
-	   RcDouble  -> MOV FF64 (OpReg reg) (OpAddr (spRel off_w))
+	   RcInteger   -> MOV IF_ARCH_i386(II32,II64)
+                              (OpReg reg) (OpAddr (spRel off_w))
+	   RcDouble    -> GST FF80 reg (spRel off_w) {- RcFloat/RcDouble -}
+	   RcDoubleSSE -> MOV FF64 (OpReg reg) (OpAddr (spRel off_w))
            _         -> panic "X86.mkSpillInstr: no match"
-		-- ToDo: will it work to always spill as a double?
-		-- does that cause a stall if the data was a float?
-#else
-x86_mkSpillInstr _ _ _
-    =   panic "X86.RegInfo.mkSpillInstr: not defined for this architecture."
-#endif
 
 
 -- | Make a spill reload instruction.
@@ -635,26 +613,16 @@ x86_mkLoadInstr
 	-> Int		-- spill slot to use
 	-> Instr
 
-#if   i386_TARGET_ARCH
-x86_mkLoadInstr reg delta slot
-  = let off     = spillSlotToOffset slot
-    in
-	let off_w = (off-delta) `div` 4
-        in case targetClassOfReg reg of {
-              RcInteger -> MOV II32 (OpAddr (spRel off_w)) (OpReg reg);
-              _         -> GLD FF80 (spRel off_w) reg} {- RcFloat/RcDouble -}
-#elif x86_64_TARGET_ARCH
 x86_mkLoadInstr reg delta slot
   = let off     = spillSlotToOffset slot
     in
-	let off_w = (off-delta) `div` 8
+	let off_w = (off-delta) `div` IF_ARCH_i386(4,8)
         in case targetClassOfReg reg of
-              RcInteger -> MOV II64 (OpAddr (spRel off_w)) (OpReg reg)
-              _         -> MOV FF64 (OpAddr (spRel off_w)) (OpReg reg)
-#else
-x86_mkLoadInstr _ _ _
- 	= panic "X86.RegInfo.mkLoadInstr: not defined for this architecture."
-#endif
+              RcInteger -> MOV IF_ARCH_i386(II32,II64) 
+                               (OpAddr (spRel off_w)) (OpReg reg)
+              RcDouble  -> GLD FF80 (spRel off_w) reg {- RcFloat/RcDouble -}
+              RcDoubleSSE -> MOV FF64 (OpAddr (spRel off_w)) (OpReg reg)
+              _           -> panic "X86.x86_mkLoadInstr"
 
 spillSlotSize :: Int
 spillSlotSize = IF_ARCH_i386(12, 8)
@@ -715,14 +683,12 @@ x86_mkRegRegMoveInstr src dst
  = case targetClassOfReg src of
 #if   i386_TARGET_ARCH
         RcInteger -> MOV II32 (OpReg src) (OpReg dst)
-        RcDouble  -> GMOV src dst
-	RcFloat   -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
 #else
         RcInteger -> MOV II64 (OpReg src) (OpReg dst)
-        RcDouble  -> MOV FF64 (OpReg src) (OpReg dst)
-	RcFloat   -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
 #endif
-
+        RcDouble    -> GMOV src dst
+        RcDoubleSSE -> MOV FF64 (OpReg src) (OpReg dst)
+	_     -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
 
 -- | Check whether an instruction represents a reg-reg move.
 -- 	The register allocator attempts to eliminate reg->reg moves whenever it can,
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 89bbb5d..fe94f21 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -181,6 +181,7 @@ pprReg s r
       RegVirtual (VirtualRegHi u)  -> text "%vHi_" <> asmSDoc (pprUnique u)
       RegVirtual (VirtualRegF  u)  -> text "%vF_" <> asmSDoc (pprUnique u)
       RegVirtual (VirtualRegD  u)  -> text "%vD_" <> asmSDoc (pprUnique u)
+      RegVirtual (VirtualRegSSE  u) -> text "%vSSE_" <> asmSDoc (pprUnique u)
   where
 #if i386_TARGET_ARCH
     ppr_reg_no :: Size -> Int -> Doc
@@ -210,10 +211,7 @@ pprReg s r
 	 2 -> sLit "%ecx";    3 -> sLit "%edx";
 	 4 -> sLit "%esi";    5 -> sLit "%edi";
 	 6 -> sLit "%ebp";    7 -> sLit "%esp";
-	 8 -> sLit "%fake0";  9 -> sLit "%fake1";
-	10 -> sLit "%fake2"; 11 -> sLit "%fake3";
-	12 -> sLit "%fake4"; 13 -> sLit "%fake5";
-	_  -> sLit "very naughty I386 register"
+         _  -> ppr_reg_float i
       })
 #elif x86_64_TARGET_ARCH
     ppr_reg_no :: Size -> Int -> Doc
@@ -271,20 +269,26 @@ pprReg s r
 	10 -> sLit "%r10";    11 -> sLit "%r11";
 	12 -> sLit "%r12";    13 -> sLit "%r13";
 	14 -> sLit "%r14";    15 -> sLit "%r15";
-	16 -> sLit "%xmm0";   17 -> sLit "%xmm1";
-	18 -> sLit "%xmm2";   19 -> sLit "%xmm3";
-	20 -> sLit "%xmm4";   21 -> sLit "%xmm5";
-	22 -> sLit "%xmm6";   23 -> sLit "%xmm7";
-	24 -> sLit "%xmm8";   25 -> sLit "%xmm9";
-	26 -> sLit "%xmm10";  27 -> sLit "%xmm11";
-	28 -> sLit "%xmm12";  29 -> sLit "%xmm13";
-	30 -> sLit "%xmm14";  31 -> sLit "%xmm15";
-	_  -> sLit "very naughty x86_64 register"
+        _  -> ppr_reg_float i
       })
 #else
      ppr_reg_no _ = panic "X86.Ppr.ppr_reg_no: no match"
 #endif
 
+ppr_reg_float :: Int -> LitString
+ppr_reg_float i = case i of
+	16 -> sLit "%fake0";  17 -> sLit "%fake1"
+	18 -> sLit "%fake2";  19 -> sLit "%fake3"
+	20 -> sLit "%fake4";  21 -> sLit "%fake5"
+	24 -> sLit "%xmm0";   25 -> sLit "%xmm1"
+	26 -> sLit "%xmm2";   27 -> sLit "%xmm3"
+	28 -> sLit "%xmm4";   29 -> sLit "%xmm5"
+	30 -> sLit "%xmm6";   31 -> sLit "%xmm7"
+	32 -> sLit "%xmm8";   33 -> sLit "%xmm9"
+	34 -> sLit "%xmm10";  35 -> sLit "%xmm11"
+	36 -> sLit "%xmm12";  37 -> sLit "%xmm13"
+	38 -> sLit "%xmm14";  39 -> sLit "%xmm15"
+	_  -> sLit "very naughty x86 register"
 
 pprSize :: Size -> Doc
 pprSize x 
@@ -293,19 +297,19 @@ pprSize x
 		II16  -> sLit "w"
 		II32  -> sLit "l"
 		II64  -> sLit "q"
-#if i386_TARGET_ARCH
-		FF32  -> sLit "s"
-		FF64  -> sLit "l"
-		FF80  -> sLit "t"
-#elif x86_64_TARGET_ARCH
 		FF32  -> sLit "ss"	-- "scalar single-precision float" (SSE2)
 		FF64  -> sLit "sd"	-- "scalar double-precision float" (SSE2)
-                _     -> panic "X86.Ppr.pprSize: no match"
-#else
-		_     -> panic "X86.Ppr.pprSize: no match"
-#endif
+		FF80  -> sLit "t"
 	        )
 
+pprSize_x87 :: Size -> Doc
+pprSize_x87 x
+  = ptext $ case x of
+		FF32  -> sLit "s"
+		FF64  -> sLit "l"
+		FF80  -> sLit "t"
+                _     -> panic "X86.Ppr.pprSize_x87"
+
 pprCond :: Cond -> Doc
 pprCond c
  = ptext (case c of {
@@ -636,12 +640,12 @@ pprInstr (MUL size op1 op2) = pprSizeOpOp (sLit "mul") size op1 op2
 
 pprInstr (FDIV size op1 op2) = pprSizeOpOp (sLit "div") size op1 op2
 
-pprInstr (CVTSS2SD from to)   = pprRegReg (sLit "cvtss2sd") from to
-pprInstr (CVTSD2SS from to)   = pprRegReg (sLit "cvtsd2ss") from to
-pprInstr (CVTTSS2SIQ from to) = pprOpReg  (sLit "cvttss2siq") from to
-pprInstr (CVTTSD2SIQ from to) = pprOpReg  (sLit "cvttsd2siq") from to
-pprInstr (CVTSI2SS from to)   = pprOpReg  (sLit "cvtsi2ssq") from to
-pprInstr (CVTSI2SD from to)   = pprOpReg  (sLit "cvtsi2sdq") from to
+pprInstr (CVTSS2SD from to)      = pprRegReg (sLit "cvtss2sd") from to
+pprInstr (CVTSD2SS from to)      = pprRegReg (sLit "cvtsd2ss") from to
+pprInstr (CVTTSS2SIQ sz from to) = pprSizeOpReg (sLit "cvttss2si") sz from to
+pprInstr (CVTTSD2SIQ sz from to) = pprSizeOpReg (sLit "cvttsd2si") sz from to
+pprInstr (CVTSI2SS sz from to)   = pprSizeOpReg (sLit "cvtsi2ss") sz from to
+pprInstr (CVTSI2SD sz from to)   = pprSizeOpReg (sLit "cvtsi2sd") sz from to
 
     -- FETCHGOT for PIC on ELF platforms
 pprInstr (FETCHGOT reg)
@@ -673,20 +677,24 @@ pprInstr g@(GMOV src dst)
    | otherwise 
    = pprG g (hcat [gtab, gpush src 0, gsemi, gpop dst 1])
 
--- GLD sz addr dst ==> FFREE %st(7) ; FLDsz addr ; FSTP (dst+1)
+-- GLD sz addr dst ==> FLDsz addr ; FSTP (dst+1)
 pprInstr g@(GLD sz addr dst)
- = pprG g (hcat [gtab, text "ffree %st(7) ; fld", pprSize sz, gsp, 
+ = pprG g (hcat [gtab, text "fld", pprSize_x87 sz, gsp, 
                  pprAddr addr, gsemi, gpop dst 1])
 
--- GST sz src addr ==> FFREE %st(7) ; FLD dst ; FSTPsz addr
+-- GST sz src addr ==> FLD dst ; FSTPsz addr
 pprInstr g@(GST sz src addr)
+ | src == fake0 && sz /= FF80 -- fstt instruction doesn't exist
+ = pprG g (hcat [gtab, 
+                 text "fst", pprSize_x87 sz, gsp, pprAddr addr])
+ | otherwise
  = pprG g (hcat [gtab, gpush src 0, gsemi, 
-                 text "fstp", pprSize sz, gsp, pprAddr addr])
+                 text "fstp", pprSize_x87 sz, gsp, pprAddr addr])
 
 pprInstr g@(GLDZ dst)
- = pprG g (hcat [gtab, text "ffree %st(7) ; fldz ; ", gpop dst 1])
+ = pprG g (hcat [gtab, text "fldz ; ", gpop dst 1])
 pprInstr g@(GLD1 dst)
- = pprG g (hcat [gtab, text "ffree %st(7) ; fld1 ; ", gpop dst 1])
+ = pprG g (hcat [gtab, text "fld1 ; ", gpop dst 1])
 
 pprInstr (GFTOI src dst) 
    = pprInstr (GDTOI src dst)
@@ -710,7 +718,7 @@ pprInstr (GITOF src dst)
 
 pprInstr g@(GITOD src dst) 
    = pprG g (hcat [gtab, text "pushl ", pprReg II32 src, 
-                   text " ; ffree %st(7); fildl (%esp) ; ",
+                   text " ; fildl (%esp) ; ",
                    gpop dst 1, text " ; addl $4,%esp"])
 
 {- Gruesome swamp follows.  If you're unfortunate enough to have ventured
@@ -868,7 +876,7 @@ pprInstr g@(GDIV _ src1 src2 dst)
 
 pprInstr GFREE 
    = vcat [ ptext (sLit "\tffree %st(0) ;ffree %st(1) ;ffree %st(2) ;ffree %st(3)"),
-            ptext (sLit "\tffree %st(4) ;ffree %st(5) ;ffree %st(6) ;ffree %st(7)") 
+            ptext (sLit "\tffree %st(4) ;ffree %st(5)") 
           ]
 
 pprInstr _
@@ -927,15 +935,14 @@ gcoerceto _    = panic "X86.Ppr.gcoerceto: no match"
 
 gpush :: Reg -> RegNo -> Doc
 gpush reg offset
-   = hcat [text "ffree %st(7) ; fld ", greg reg offset]
-
+   = hcat [text "fld ", greg reg offset]
 
 gpop :: Reg -> RegNo -> Doc
 gpop reg offset
    = hcat [text "fstp ", greg reg offset]
 
 greg :: Reg -> RegNo -> Doc
-greg reg offset = text "%st(" <> int (gregno reg - 8+offset) <> char ')'
+greg reg offset = text "%st(" <> int (gregno reg - 16+offset) <> char ')'
 
 gsemi :: Doc
 gsemi = text " ; "
@@ -1072,11 +1079,11 @@ pprRegReg name reg1 reg2
     ]
 
 
-pprOpReg :: LitString -> Operand -> Reg -> Doc
-pprOpReg name op1 reg2
+pprSizeOpReg :: LitString -> Size -> Operand -> Reg -> Doc
+pprSizeOpReg name size op1 reg2
   = hcat [
-	pprMnemonic_ name,
-	pprOperand archWordSize op1,
+	pprMnemonic name size,
+	pprOperand size op1,
         comma,
         pprReg archWordSize reg2
     ]
diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs
index ed420a4..eb8e82c 100644
--- a/compiler/nativeGen/X86/RegInfo.hs
+++ b/compiler/nativeGen/X86/RegInfo.hs
@@ -23,12 +23,11 @@ import X86.Regs
 
 mkVirtualReg :: Unique -> Size -> VirtualReg
 mkVirtualReg u size
-   | not (isFloatSize size) = VirtualRegI u
-   | otherwise
    = case size of
-        FF32	-> VirtualRegD u
-        FF64	-> VirtualRegD u
-	_	-> panic "mkVirtualReg"
+        FF32	-> VirtualRegSSE u
+        FF64	-> VirtualRegSSE u
+        FF80	-> VirtualRegD   u
+        _other  -> VirtualRegI   u
 
 
 -- reg colors for x86
@@ -44,15 +43,8 @@ regColors
  $  	[ (eax,	"#00ff00")
 	, (ebx,	"#0000ff")
 	, (ecx,	"#00ffff")
-	, (edx,	"#0080ff")
-
-	, (fake0, "#ff00ff")
-	, (fake1, "#ff00aa")
-	, (fake2, "#aa00ff")
-	, (fake3, "#aa00aa")
-	, (fake4, "#ff0055")
-	, (fake5, "#5500ff") ]
-
+	, (edx,	"#0080ff") ]
+        ++ fpRegColors
 
 -- reg colors for x86_64
 #elif x86_64_TARGET_ARCH
@@ -76,9 +68,19 @@ regColors
 	, (r13, "#004080")
 	, (r14, "#004040")
 	, (r15, "#002080") ]
-
-	++ zip (map regSingle [16..31]) (repeat "red")
+	++ fpRegColors
 #else
 regDotColor :: Reg -> SDoc
 regDotColor	= panic "not defined"
 #endif
+
+fpRegColors :: [(Reg,String)]
+fpRegColors =
+        [ (fake0, "#ff00ff")
+	, (fake1, "#ff00aa")
+	, (fake2, "#aa00ff")
+	, (fake3, "#aa00aa")
+	, (fake4, "#ff0055")
+	, (fake5, "#5500ff") ]
+
+	++ zip (map regSingle [24..39]) (repeat "red")
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index 64d835b..bed9dc5 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -49,11 +49,6 @@ where
 #include "nativeGen/NCG.h"
 #include "HsVersions.h"
 
-#if i386_TARGET_ARCH
-# define STOLEN_X86_REGS 4
--- HACK: go for the max
-#endif
-
 #include "../includes/stg/MachRegs.h"
 
 import Reg
@@ -88,58 +83,23 @@ virtualRegSqueeze cls vr
 	 -> case vr of
 	 	VirtualRegI{}		-> _ILIT(1)
 		VirtualRegHi{}		-> _ILIT(1)
-		VirtualRegD{}		-> _ILIT(0)
-		VirtualRegF{}		-> _ILIT(0)
-
-	-- We don't use floats on this arch, but we can't
-	--	return error because the return type is unboxed...
-	RcFloat
-	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
-		VirtualRegD{}		-> _ILIT(0)
-		VirtualRegF{}		-> _ILIT(0)
+                _other                  -> _ILIT(0)
 
 	RcDouble
 	 -> case vr of
-	 	VirtualRegI{}		-> _ILIT(0)
-		VirtualRegHi{}		-> _ILIT(0)
 		VirtualRegD{}		-> _ILIT(1)
 		VirtualRegF{}		-> _ILIT(0)
+                _other                  -> _ILIT(0)
 
-{-# INLINE realRegSqueeze #-}
-realRegSqueeze :: RegClass -> RealReg -> FastInt
-
-#if defined(i386_TARGET_ARCH)
-realRegSqueeze cls rr
- = case cls of
- 	RcInteger
-	 -> case rr of
-	 	RealRegSingle regNo
-			| regNo	< 8	-> _ILIT(1)	-- first fp reg is 8
-			| otherwise	-> _ILIT(0)
-			
-		RealRegPair{}		-> _ILIT(0)
-
-	-- We don't use floats on this arch, but we can't
-	--	return error because the return type is unboxed...
-	RcFloat
-	 -> case rr of
-	 	RealRegSingle regNo
-			| regNo	< 8	-> _ILIT(0)
-			| otherwise	-> _ILIT(0)
-			
-		RealRegPair{}		-> _ILIT(0)
+	RcDoubleSSE
+	 -> case vr of
+		VirtualRegSSE{}		-> _ILIT(1)
+                _other                  -> _ILIT(0)
 
-	RcDouble
-	 -> case rr of
-	 	RealRegSingle regNo
-			| regNo	< 8	-> _ILIT(0)
-			| otherwise	-> _ILIT(1)
-			
-		RealRegPair{}		-> _ILIT(0)
+        _other -> _ILIT(0)
 
-#elif defined(x86_64_TARGET_ARCH)
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> FastInt
 realRegSqueeze cls rr
  = case cls of
  	RcInteger
@@ -150,29 +110,20 @@ realRegSqueeze cls rr
 			
 		RealRegPair{}		-> _ILIT(0)
 
-	-- We don't use floats on this arch, but we can't
-	--	return error because the return type is unboxed...
-	RcFloat
+	RcDouble
 	 -> case rr of
 	 	RealRegSingle regNo
-			| regNo	< 16	-> _ILIT(0)
+			| regNo >= 16 && regNo < 24 -> _ILIT(1)
 			| otherwise	-> _ILIT(0)
 			
 		RealRegPair{}		-> _ILIT(0)
 
-	RcDouble
+        RcDoubleSSE
 	 -> case rr of
-	 	RealRegSingle regNo
-			| regNo	< 16	-> _ILIT(0)
-			| otherwise	-> _ILIT(1)
-			
-		RealRegPair{}		-> _ILIT(0)
-
-#else
-realRegSqueeze _ _	= _ILIT(0)
-#endif
-
+	 	RealRegSingle regNo | regNo >= 24 -> _ILIT(1)
+                _otherwise                        -> _ILIT(0)
 
+        _other -> _ILIT(0)
 
 -- -----------------------------------------------------------------------------
 -- Immediates
@@ -275,87 +226,48 @@ spRel _	= panic "X86.Regs.spRel: not defined for this architecture"
 argRegs :: RegNo -> [Reg]
 argRegs _	= panic "MachRegs.argRegs(x86): should not be used!"
 
-
-
-
-
 -- | The complete set of machine registers.
 allMachRegNos :: [RegNo]
-
-#if   i386_TARGET_ARCH
-allMachRegNos	= [0..13]
-
-#elif x86_64_TARGET_ARCH
-allMachRegNos  = [0..31]
-
+#if i386_TARGET_ARCH
+allMachRegNos  = [0..7]  ++ floatregs -- not %r8..%r15
 #else
-allMachRegNos	= panic "X86.Regs.callClobberedRegs: not defined for this architecture"
-
+allMachRegNos  = [0..15] ++ floatregs
 #endif
-
+  where floatregs = fakes ++ xmms; fakes = [16..21]; xmms = [24..39]
 
 -- | Take the class of a register.
 {-# INLINE classOfRealReg      #-}
 classOfRealReg :: RealReg -> RegClass
-
-#if   i386_TARGET_ARCH
 -- On x86, we might want to have an 8-bit RegClass, which would
 -- contain just regs 1-4 (the others don't have 8-bit versions).
 -- However, we can get away without this at the moment because the
 -- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
 classOfRealReg reg
  = case reg of
-	RealRegSingle i	-> if i < 8 then RcInteger else RcDouble
-	RealRegPair{}	-> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+	RealRegSingle i
+          | i < 16    -> RcInteger
+          | i < 24    -> RcDouble
+          | otherwise -> RcDoubleSSE
 
-#elif x86_64_TARGET_ARCH
--- On x86, we might want to have an 8-bit RegClass, which would
--- contain just regs 1-4 (the others don't have 8-bit versions).
--- However, we can get away without this at the moment because the
--- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
-classOfRealReg reg
- = case reg of
-	RealRegSingle i	-> if i < 16 then RcInteger else RcDouble
 	RealRegPair{}	-> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
 
-#else
-classOfRealReg _	= panic "X86.Regs.regClass: not defined for this architecture"
-
-#endif
-
-
 -- | Get the name of the register with this number.
 showReg :: RegNo -> String
-
-#if   i386_TARGET_ARCH
-showReg n
-   = if   n >= 0 && n < 14
-     then regNames !! n
-     else "%unknown_x86_real_reg_" ++ show n
-
-regNames :: [String]
-regNames 
-   = ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp", 
-      "%fake0", "%fake1", "%fake2", "%fake3", "%fake4", "%fake5", "%fake6"]
-
-#elif x86_64_TARGET_ARCH
 showReg n
-	| n >= 16	= "%xmm" ++ show (n-16)
+	| n >= 24	= "%xmm" ++ show (n-24)
+        | n >= 16       = "%fake" ++ show (n-16)
 	| n >= 8	= "%r" ++ show n
 	| otherwise	= regNames !! n
 
 regNames :: [String]
 regNames 
- = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ]
-
-#else
-showReg _	= panic "X86.Regs.showReg: not defined for this architecture"
-
+#if   i386_TARGET_ARCH
+   = ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+#elif x86_64_TARGET_ARCH
+   = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ]
 #endif
 
 
-
-
 -- machine specific ------------------------------------------------------------
 
 
@@ -385,12 +297,12 @@ esi   = regSingle 4
 edi   = regSingle 5
 ebp   = regSingle 6
 esp   = regSingle 7
-fake0 = regSingle 8
-fake1 = regSingle 9
-fake2 = regSingle 10
-fake3 = regSingle 11
-fake4 = regSingle 12
-fake5 = regSingle 13
+fake0 = regSingle 16
+fake1 = regSingle 17
+fake2 = regSingle 18
+fake3 = regSingle 19
+fake4 = regSingle 20
+fake5 = regSingle 21
 
 
 
@@ -423,25 +335,25 @@ r12   = regSingle 12
 r13   = regSingle 13
 r14   = regSingle 14
 r15   = regSingle 15
-xmm0  = regSingle 16
-xmm1  = regSingle 17
-xmm2  = regSingle 18
-xmm3  = regSingle 19
-xmm4  = regSingle 20
-xmm5  = regSingle 21
-xmm6  = regSingle 22
-xmm7  = regSingle 23
-xmm8  = regSingle 24
-xmm9  = regSingle 25
-xmm10 = regSingle 26
-xmm11 = regSingle 27
-xmm12 = regSingle 28
-xmm13 = regSingle 29
-xmm14 = regSingle 30
-xmm15 = regSingle 31
+xmm0  = regSingle 24
+xmm1  = regSingle 25
+xmm2  = regSingle 26
+xmm3  = regSingle 27
+xmm4  = regSingle 28
+xmm5  = regSingle 29
+xmm6  = regSingle 30
+xmm7  = regSingle 31
+xmm8  = regSingle 32
+xmm9  = regSingle 33
+xmm10 = regSingle 34
+xmm11 = regSingle 35
+xmm12 = regSingle 36
+xmm13 = regSingle 37
+xmm14 = regSingle 38
+xmm15 = regSingle 39
 
 allFPArgRegs :: [Reg]
-allFPArgRegs	= map regSingle [16 .. 23]
+allFPArgRegs	= map regSingle [24 .. 31]
 
 ripRel :: Displacement -> AddrMode
 ripRel imm	= AddrBaseIndex EABaseRip EAIndexNone imm
@@ -460,7 +372,7 @@ esp = rsp
 -}
 
 xmm :: RegNo -> Reg
-xmm n = regSingle (16+n)
+xmm n = regSingle (24+n)
 
 
 
@@ -482,12 +394,6 @@ callClobberedRegs 	:: [Reg]
 #define edi 5
 #define ebp 6
 #define esp 7
-#define fake0 8
-#define fake1 9
-#define fake2 10
-#define fake3 11
-#define fake4 12
-#define fake5 13
 #endif
 
 #if x86_64_TARGET_ARCH
@@ -507,24 +413,31 @@ callClobberedRegs 	:: [Reg]
 #define r13   13
 #define r14   14
 #define r15   15
-#define xmm0  16
-#define xmm1  17
-#define xmm2  18
-#define xmm3  19
-#define xmm4  20
-#define xmm5  21
-#define xmm6  22
-#define xmm7  23
-#define xmm8  24
-#define xmm9  25
-#define xmm10 26
-#define xmm11 27
-#define xmm12 28
-#define xmm13 29
-#define xmm14 30
-#define xmm15 31
 #endif
 
+#define fake0 16
+#define fake1 17
+#define fake2 18
+#define fake3 19
+#define fake4 20
+#define fake5 21
+
+#define xmm0  24
+#define xmm1  25
+#define xmm2  26
+#define xmm3  27
+#define xmm4  28
+#define xmm5  29
+#define xmm6  30
+#define xmm7  31
+#define xmm8  32
+#define xmm9  33
+#define xmm10 34
+#define xmm11 35
+#define xmm12 36
+#define xmm13 37
+#define xmm14 38
+#define xmm15 39
 
 
 #if i386_TARGET_ARCH
@@ -697,13 +610,13 @@ allArgRegs  = panic "X86.Regs.allArgRegs: not defined for this architecture"
 #if   i386_TARGET_ARCH
 -- caller-saves registers
 callClobberedRegs
-  = map regSingle [eax,ecx,edx,fake0,fake1,fake2,fake3,fake4,fake5]
+  = map regSingle ([eax,ecx,edx]  ++ [16..39])
 
 #elif x86_64_TARGET_ARCH
 -- all xmm regs are caller-saves
 -- caller-saves registers
 callClobberedRegs    
-  = map regSingle ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..31])
+  = map regSingle ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..39])
 
 #else
 callClobberedRegs
diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml
index 5724850..6be97d7 100644
--- a/docs/users_guide/flags.xml
+++ b/docs/users_guide/flags.xml
@@ -2064,6 +2064,14 @@ phase <replaceable>n</replaceable></entry>
 	      <entry>Reverse</entry>
 	    </row>
 	  </thead>
+          <tbody>
+            <row>
+	      <entry><option>-msse2</option></entry>
+	      <entry>(x86 only) Use SSE2 for floating point</entry>
+	      <entry>dynamic</entry>
+	      <entry>-</entry>
+            </row>
+          </tbody>
 	  <tbody>
 	    <row>
 	      <entry><option>-monly-[432]-regs</option></entry>
diff --git a/docs/users_guide/using.xml b/docs/users_guide/using.xml
index eb6b0c0..329c31f 100644
--- a/docs/users_guide/using.xml
+++ b/docs/users_guide/using.xml
@@ -2074,9 +2074,27 @@ f "2"    = 2
     <variablelist>
 
       <varlistentry>
+	<term><option>-msse2</option>:</term>
+	<listitem>
+          <para>
+            (x86 only, added in GHC 6.14.1) Use the SSE2 registers and
+            instruction set to implement floating point operations
+            when using the native code generator.  This gives a
+            substantial performance improvement for floating point,
+            but the resulting compiled code will only run on
+            processors that support SSE2 (Intel Pentium 4 and later,
+            or AMD Athlon 64 and later).
+          </para>
+          <para>
+            SSE2 is unconditionally used on x86-64 platforms.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
 	<term><option>-monly-[32]-regs</option>:</term>
 	<listitem>
-	  <para>(iX86 machines)<indexterm><primary>-monly-N-regs
+	  <para>(x86 only)<indexterm><primary>-monly-N-regs
           option (iX86 only)</primary></indexterm> GHC tries to
           &ldquo;steal&rdquo; four registers from GCC, for performance
           reasons; it almost always works.  However, when GCC is
-- 
1.7.10.4