X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=compiler%2FnativeGen%2FX86%2FCodeGen.hs;h=39de19c412c124c4131eb3af010b15d5460c43c7;hp=43495a45a5e74cd2f08895a0f21f75885b82216a;hb=8133a9f47b99f4e65ed30551de32ad72c6b61b27;hpb=b04a210e26ca57242fd052f2aa91011a80b76299 diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 43495a4..39de19c 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1,10 +1,3 @@ -{-# OPTIONS -w #-} --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - ----------------------------------------------------------------------------- -- -- Generating machine code (instruction selection) @@ -20,6 +13,7 @@ module X86.CodeGen ( cmmTopCodeGen, + generateJumpTableForInstr, InstrBlock ) @@ -27,69 +21,83 @@ where #include "HsVersions.h" #include "nativeGen/NCG.h" -#include "MachDeps.h" +#include "../includes/MachDeps.h" -- NCG stuff: import X86.Instr import X86.Cond import X86.Regs import X86.RegInfo -import X86.Ppr import Instruction import PIC import NCGMonad import Size import Reg -import RegClass import Platform -- Our intermediate code: import BasicTypes import BlockId -import PprCmm ( pprExpr ) -import Cmm +import PprCmm () +import OldCmm +import OldPprCmm () import CLabel -import ClosureInfo ( C_SRT(..) ) -- The rest: import StaticFlags ( opt_PIC ) import ForeignCall ( CCallConv(..) ) import OrdList -import Pretty -import qualified Outputable as O import Outputable +import Unique import FastString import FastBool ( isFastTrue ) import Constants ( wORD_SIZE ) import DynFlags -import Debug.Trace ( trace ) +import Control.Monad ( mapAndUnzipM ) +import Data.Maybe ( catMaybes ) +import Data.Int -import Control.Monad ( mapAndUnzipM ) -import Data.Maybe ( fromJust ) -import Data.Bits +#if WORD_SIZE_IN_BITS==32 +import Data.Maybe ( fromJust ) import Data.Word -import Data.Int +import Data.Bits +#endif +sse2Enabled :: NatM Bool +#if x86_64_TARGET_ARCH +-- SSE2 is fixed on for x86_64. It would be possible to make it optional, +-- but we'd need to fix at least the foreign call code where the calling +-- convention specifies the use of xmm regs, and possibly other places. 
+sse2Enabled = return True +#else +sse2Enabled = do + dflags <- getDynFlagsNat + return (dopt Opt_SSE2 dflags) +#endif + +if_sse2 :: NatM a -> NatM a -> NatM a +if_sse2 sse2 x87 = do + b <- sse2Enabled + if b then sse2 else x87 cmmTopCodeGen - :: DynFlags - -> RawCmmTop + :: RawCmmTop -> NatM [NatCmmTop Instr] -cmmTopCodeGen dynflags - (CmmProc info lab params (ListGraph blocks)) = do +cmmTopCodeGen (CmmProc info lab (ListGraph blocks)) = do (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks picBaseMb <- getPicBaseMaybeNat - let proc = CmmProc info lab params (ListGraph $ concat nat_blocks) + dflags <- getDynFlagsNat + let proc = CmmProc info lab (ListGraph $ concat nat_blocks) tops = proc : concat statics - os = platformOS $ targetPlatform dynflags + os = platformOS $ targetPlatform dflags case picBaseMb of Just picBase -> initializePicBase_x86 ArchX86 os picBase tops Nothing -> return tops -cmmTopCodeGen _ (CmmData sec dat) = do +cmmTopCodeGen (CmmData sec dat) = do return [CmmData sec dat] -- no translation, we just use CmmStatic @@ -152,8 +160,8 @@ stmtToInstrs stmt = case stmt of CmmBranch id -> genBranch id CmmCondBranch arg id -> genCondJump id arg CmmSwitch arg ids -> genSwitch arg ids - CmmJump arg params -> genJump arg - CmmReturn params -> + CmmJump arg _ -> genJump arg + CmmReturn _ -> panic "stmtToInstrs: return statement should have been cps'd away" @@ -172,6 +180,7 @@ data CondCode = CondCode Bool Cond InstrBlock +#if WORD_SIZE_IN_BITS==32 -- | a.k.a "Register64" -- Reg is the lower 32-bit temporary which contains the result. -- Use getHiVRegFromLo to find the other VRegUnique. @@ -183,6 +192,7 @@ data ChildCode64 = ChildCode64 InstrBlock Reg +#endif -- | Register's passed up the tree. If the stix code forces the register @@ -201,18 +211,21 @@ swizzleRegisterRep (Any _ codefn) size = Any size codefn -- | Grab the Reg for a CmmReg -getRegisterReg :: CmmReg -> Reg +getRegisterReg :: Bool -> CmmReg -> Reg -getRegisterReg (CmmLocal (LocalReg u pk)) - = mkVReg u (cmmTypeSize pk) +getRegisterReg use_sse2 (CmmLocal (LocalReg u pk)) + = let sz = cmmTypeSize pk in + if isFloatSize sz && not use_sse2 + then RegVirtual (mkVirtualReg u FF80) + else RegVirtual (mkVirtualReg u sz) -getRegisterReg (CmmGlobal mid) - = case get_GlobalReg_reg_or_addr mid of - Left (RealReg rrno) -> RealReg rrno - _other -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) - -- By this stage, the only MagicIds remaining should be the - -- ones which map to a real machine register on this - -- platform. Hence ... +getRegisterReg _ (CmmGlobal mid) + = case globalRegMaybe mid of + Just reg -> RegReal $ reg + Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) + -- By this stage, the only MagicIds remaining should be the + -- ones which map to a real machine register on this + -- platform. Hence ... -- | Memory addressing modes passed up the tree. 
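-- Editorial note (not part of the original patch): `if_sse2` above is the
-- central dispatch point of this change, selecting between the new SSE2
-- code and the legacy x87 code. A minimal sketch of the idiom, using a
-- hypothetical wrapper name (the real call sites are the MO_F_* cases in
-- getRegister further down):
--
--     genFloatSub :: Width -> CmmExpr -> CmmExpr -> NatM Register
--     genFloatSub w x y = if_sse2 (trivialFCode_sse2 w SUB x y)
--                                 (trivialFCode_x87 GSUB x y)
--
-- Each floating-point case consults sse2Enabled once and emits either an
-- xmm-register instruction (SUB) or its 80-bit x87 stack equivalent
-- (GSUB).  This is also why getRegisterReg grows the use_sse2 flag above:
-- float virtual registers must be FF80 on the x87 path but get their
-- natural SSE size otherwise.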
@@ -252,8 +265,8 @@ is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000 -- | Convert a BlockId to some CmmStatic data jumpTableEntry :: Maybe BlockId -> CmmStatic jumpTableEntry Nothing = CmmStaticLit (CmmInt 0 wordWidth) -jumpTableEntry (Just (BlockId id)) = CmmStaticLit (CmmLabel blockLabel) - where blockLabel = mkAsmTempLabel id +jumpTableEntry (Just blockid) = CmmStaticLit (CmmLabel blockLabel) + where blockLabel = mkAsmTempLabel (getUnique blockid) -- ----------------------------------------------------------------------------- @@ -261,8 +274,8 @@ jumpTableEntry (Just (BlockId id)) = CmmStaticLit (CmmLabel blockLabel) -- Expand CmmRegOff. ToDo: should we do it this way around, or convert -- CmmExprs into CmmRegOff? -mangleIndexTree :: CmmExpr -> CmmExpr -mangleIndexTree (CmmRegOff reg off) +mangleIndexTree :: CmmReg -> Int -> CmmExpr +mangleIndexTree reg off = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] where width = typeWidth (cmmRegType reg) @@ -279,9 +292,7 @@ getSomeReg expr = do return (reg, code) - - - +#if WORD_SIZE_IN_BITS==32 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock assignMem_I64Code addrTree valueTree = do Amode addr addr_code <- getAmode addrTree @@ -297,10 +308,10 @@ assignMem_I64Code addrTree valueTree = do assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock -assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree = do +assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do ChildCode64 vcode r_src_lo <- iselExpr64 valueTree let - r_dst_lo = mkVReg u_dst II32 + r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32 r_dst_hi = getHiVRegFromLo r_dst_lo r_src_hi = getHiVRegFromLo r_src_lo mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo) @@ -310,12 +321,10 @@ assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree = do vcode `snocOL` mov_lo `snocOL` mov_hi ) -assignReg_I64Code lvalue valueTree +assignReg_I64Code _ _ = panic "assignReg_I64Code(i386): invalid lvalue" - - iselExpr64 :: CmmExpr -> NatM ChildCode64 iselExpr64 (CmmLit (CmmInt i _)) = do (rlo,rhi) <- getNewRegPairNat II32 @@ -342,7 +351,7 @@ iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do ) iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty - = return (ChildCode64 nilOL (mkVReg vu II32)) + = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32)) -- we handle addition, but rather badly iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do @@ -389,7 +398,7 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do iselExpr64 expr = pprPanic "iselExpr64(i386)" (ppr expr) - +#endif -------------------------------------------------------------------------------- @@ -405,11 +414,17 @@ getRegister (CmmReg (CmmGlobal PicBaseReg)) #endif getRegister (CmmReg reg) - = return (Fixed (cmmTypeSize (cmmRegType reg)) - (getRegisterReg reg) nilOL) + = do use_sse2 <- sse2Enabled + let + sz = cmmTypeSize (cmmRegType reg) + size | not use_sse2 && isFloatSize sz = FF80 + | otherwise = sz + -- + return (Fixed size (getRegisterReg use_sse2 reg) nilOL) + -getRegister tree@(CmmRegOff _ _) - = getRegister (mangleIndexTree tree) +getRegister (CmmRegOff r n) + = getRegister $ mangleIndexTree r n #if WORD_SIZE_IN_BITS==32 @@ -437,78 +452,35 @@ getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do #endif - - -#if i386_TARGET_ARCH - -getRegister (CmmLit (CmmFloat f W32)) = do - lbl <- getNewLabelNat - dflags <- getDynFlagsNat - dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl - Amode addr addr_code <- getAmode dynRef 
- let code dst = - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat f W32)] - `consOL` (addr_code `snocOL` - GLD FF32 addr dst) - -- in - return (Any FF32 code) - - -getRegister (CmmLit (CmmFloat d W64)) - | d == 0.0 - = let code dst = unitOL (GLDZ dst) - in return (Any FF64 code) - - | d == 1.0 - = let code dst = unitOL (GLD1 dst) - in return (Any FF64 code) - - | otherwise = do - lbl <- getNewLabelNat - dflags <- getDynFlagsNat - dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl - Amode addr addr_code <- getAmode dynRef - let code dst = - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat d W64)] - `consOL` (addr_code `snocOL` - GLD FF64 addr dst) - -- in - return (Any FF64 code) - -#endif /* i386_TARGET_ARCH */ - - - - -#if x86_64_TARGET_ARCH -getRegister (CmmLit (CmmFloat 0.0 w)) = do - let size = floatSize w - code dst = unitOL (XOR size (OpReg dst) (OpReg dst)) - -- I don't know why there are xorpd, xorps, and pxor instructions. - -- They all appear to do the same thing --SDM - return (Any size code) - -getRegister (CmmLit (CmmFloat f w)) = do - lbl <- getNewLabelNat - let code dst = toOL [ - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat f w)], - MOV size (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - ] - -- in - return (Any size code) - where size = floatSize w - -#endif /* x86_64_TARGET_ARCH */ - - - - +getRegister (CmmLit lit@(CmmFloat f w)) = + if_sse2 float_const_sse2 float_const_x87 + where + float_const_sse2 + | f == 0.0 = do + let + size = floatSize w + code dst = unitOL (XOR size (OpReg dst) (OpReg dst)) + -- I don't know why there are xorpd, xorps, and pxor instructions. + -- They all appear to do the same thing --SDM + return (Any size code) + + | otherwise = do + Amode addr code <- memConstant (widthInBytes w) lit + loadFloatAmode True w addr code + + float_const_x87 = case w of + W64 + | f == 0.0 -> + let code dst = unitOL (GLDZ dst) + in return (Any FF80 code) + + | f == 1.0 -> + let code dst = unitOL (GLD1 dst) + in return (Any FF80 code) + + _otherwise -> do + Amode addr code <- memConstant (widthInBytes w) lit + loadFloatAmode False w addr code -- catch simple cases of zero- or sign-extended load getRegister (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do @@ -560,61 +532,20 @@ getRegister (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), = return $ Any II64 (\dst -> unitOL $ LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) -getRegister (CmmMachOp (MO_F_Neg W32) [x]) = do - x_code <- getAnyReg x - lbl <- getNewLabelNat - let - code dst = x_code dst `appOL` toOL [ - -- This is how gcc does it, so it can't be that bad: - LDATA ReadOnlyData16 [ - CmmAlign 16, - CmmDataLabel lbl, - CmmStaticLit (CmmInt 0x80000000 W32), - CmmStaticLit (CmmInt 0 W32), - CmmStaticLit (CmmInt 0 W32), - CmmStaticLit (CmmInt 0 W32) - ], - XOR FF32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - -- xorps, so we need the 128-bit constant - -- ToDo: rip-relative - ] - -- - return (Any FF32 code) - -getRegister (CmmMachOp (MO_F_Neg W64) [x]) = do - x_code <- getAnyReg x - lbl <- getNewLabelNat - let - -- This is how gcc does it, so it can't be that bad: - code dst = x_code dst `appOL` toOL [ - LDATA ReadOnlyData16 [ - CmmAlign 16, - CmmDataLabel lbl, - CmmStaticLit (CmmInt 0x8000000000000000 W64), - CmmStaticLit (CmmInt 0 W64) - ], - -- gcc puts an unpck here. Wonder if we need it. 
- XOR FF64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - -- xorpd, so we need the 128-bit constant - ] - -- - return (Any FF64 code) - #endif /* x86_64_TARGET_ARCH */ -getRegister (CmmMachOp mop [x]) -- unary MachOps - = case mop of -#if i386_TARGET_ARCH - MO_F_Neg W32 -> trivialUFCode FF32 (GNEG FF32) x - MO_F_Neg W64 -> trivialUFCode FF64 (GNEG FF64) x -#endif +getRegister (CmmMachOp mop [x]) = do -- unary MachOps + sse2 <- sse2Enabled + case mop of + MO_F_Neg w + | sse2 -> sse2NegCode w x + | otherwise -> trivialUFCode FF80 (GNEG FF80) x MO_S_Neg w -> triv_ucode NEGI (intSize w) - MO_F_Neg w -> triv_ucode NEGI (floatSize w) MO_Not w -> triv_ucode NOT (intSize w) -- Nop conversions @@ -659,18 +590,16 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps -- the form of a movzl and print it as a movl later. #endif -#if i386_TARGET_ARCH - MO_FF_Conv W32 W64 -> conversionNop FF64 x - MO_FF_Conv W64 W32 -> conversionNop FF32 x -#else - MO_FF_Conv W32 W64 -> coerceFP2FP W64 x + MO_FF_Conv W32 W64 + | sse2 -> coerceFP2FP W64 x + | otherwise -> conversionNop FF80 x + MO_FF_Conv W64 W32 -> coerceFP2FP W32 x -#endif MO_FS_Conv from to -> coerceFP2Int from to x MO_SF_Conv from to -> coerceInt2FP from to x - other -> pprPanic "getRegister" (pprMachOp mop) + _other -> pprPanic "getRegister" (pprMachOp mop) where triv_ucode :: (Size -> Operand -> Instr) -> Size -> NatM Register triv_ucode instr size = trivialUCode size (instr size) x @@ -707,41 +636,37 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps return (swizzleRegisterRep e_code new_size) -getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps - = case mop of - MO_F_Eq w -> condFltReg EQQ x y - MO_F_Ne w -> condFltReg NE x y - MO_F_Gt w -> condFltReg GTT x y - MO_F_Ge w -> condFltReg GE x y - MO_F_Lt w -> condFltReg LTT x y - MO_F_Le w -> condFltReg LE x y - - MO_Eq rep -> condIntReg EQQ x y - MO_Ne rep -> condIntReg NE x y - - MO_S_Gt rep -> condIntReg GTT x y - MO_S_Ge rep -> condIntReg GE x y - MO_S_Lt rep -> condIntReg LTT x y - MO_S_Le rep -> condIntReg LE x y - - MO_U_Gt rep -> condIntReg GU x y - MO_U_Ge rep -> condIntReg GEU x y - MO_U_Lt rep -> condIntReg LU x y - MO_U_Le rep -> condIntReg LEU x y - -#if i386_TARGET_ARCH - MO_F_Add w -> trivialFCode w GADD x y - MO_F_Sub w -> trivialFCode w GSUB x y - MO_F_Quot w -> trivialFCode w GDIV x y - MO_F_Mul w -> trivialFCode w GMUL x y -#endif - -#if x86_64_TARGET_ARCH - MO_F_Add w -> trivialFCode w ADD x y - MO_F_Sub w -> trivialFCode w SUB x y - MO_F_Quot w -> trivialFCode w FDIV x y - MO_F_Mul w -> trivialFCode w MUL x y -#endif +getRegister (CmmMachOp mop [x, y]) = do -- dyadic MachOps + sse2 <- sse2Enabled + case mop of + MO_F_Eq _ -> condFltReg EQQ x y + MO_F_Ne _ -> condFltReg NE x y + MO_F_Gt _ -> condFltReg GTT x y + MO_F_Ge _ -> condFltReg GE x y + MO_F_Lt _ -> condFltReg LTT x y + MO_F_Le _ -> condFltReg LE x y + + MO_Eq _ -> condIntReg EQQ x y + MO_Ne _ -> condIntReg NE x y + + MO_S_Gt _ -> condIntReg GTT x y + MO_S_Ge _ -> condIntReg GE x y + MO_S_Lt _ -> condIntReg LTT x y + MO_S_Le _ -> condIntReg LE x y + + MO_U_Gt _ -> condIntReg GU x y + MO_U_Ge _ -> condIntReg GEU x y + MO_U_Lt _ -> condIntReg LU x y + MO_U_Le _ -> condIntReg LEU x y + + MO_F_Add w | sse2 -> trivialFCode_sse2 w ADD x y + | otherwise -> trivialFCode_x87 GADD x y + MO_F_Sub w | sse2 -> trivialFCode_sse2 w SUB x y + | otherwise -> trivialFCode_x87 GSUB x y + MO_F_Quot w | sse2 -> trivialFCode_sse2 w FDIV x y + | otherwise -> trivialFCode_x87 GDIV x y + MO_F_Mul w | sse2 -> trivialFCode_sse2 w MUL x y + | 
otherwise -> trivialFCode_x87 GMUL x y MO_Add rep -> add_code rep x y MO_Sub rep -> sub_code rep x y @@ -766,7 +691,7 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Shr rep -> shift_code rep SHR x y {-False-} MO_S_Shr rep -> shift_code rep SAR x y {-False-} - other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) + _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) where -------------------- triv_op width instr = trivialCode width op (Just op) x y @@ -803,7 +728,7 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps -> NatM Register {- Case1: shift length as immediate -} - shift_code width instr x y@(CmmLit lit) = do + shift_code width instr x (CmmLit lit) = do x_code <- getAnyReg x let size = intSize width @@ -892,13 +817,9 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps getRegister (CmmLoad mem pk) | isFloatType pk = do - Amode src mem_code <- getAmode mem - let - size = cmmTypeSize pk - code dst = mem_code `snocOL` - IF_ARCH_i386(GLD size src dst, - MOV size (OpAddr src) (OpReg dst)) - return (Any size code) + Amode addr mem_code <- getAmode mem + use_sse2 <- sse2Enabled + loadFloatAmode use_sse2 (typeWidth pk) addr mem_code #if i386_TARGET_ARCH getRegister (CmmLoad mem pk) @@ -933,8 +854,7 @@ getRegister (CmmLit (CmmInt 0 width)) size = intSize width -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits - adj_size = case size of II64 -> II32; _ -> size - size1 = IF_ARCH_i386( size, adj_size ) + size1 = IF_ARCH_i386( size, case size of II64 -> II32; _ -> size ) code dst = unitOL (XOR size1 (OpReg dst) (OpReg dst)) in @@ -1022,7 +942,9 @@ getNonClobberedReg expr = do return (tmp, code tmp) Fixed rep reg code -- only free regs can be clobbered - | RealReg rr <- reg, isFastTrue (freeReg rr) -> do + | RegReal (RealRegSingle rr) <- reg + , isFastTrue (freeReg rr) + -> do tmp <- getNewRegNat rep return (tmp, code `snocOL` reg2reg rep reg tmp) | otherwise -> @@ -1030,16 +952,13 @@ getNonClobberedReg expr = do reg2reg :: Size -> Reg -> Reg -> Instr reg2reg size src dst -#if i386_TARGET_ARCH - | isFloatSize size = GMOV src dst -#endif - | otherwise = MOV size (OpReg src) (OpReg dst) - + | size == FF80 = GMOV src dst + | otherwise = MOV size (OpReg src) (OpReg dst) -------------------------------------------------------------------------------- getAmode :: CmmExpr -> NatM Amode -getAmode tree@(CmmRegOff _ _) = getAmode (mangleIndexTree tree) +getAmode (CmmRegOff r n) = getAmode $ mangleIndexTree r n #if x86_64_TARGET_ARCH @@ -1052,18 +971,18 @@ getAmode (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), -- This is all just ridiculous, since it carefully undoes -- what mangleIndexTree has just done. -getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)]) +getAmode (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)]) | is32BitLit lit -- ASSERT(rep == II32)??? = do (x_reg, x_code) <- getSomeReg x let off = ImmInt (-(fromInteger i)) return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code) -getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)]) +getAmode (CmmMachOp (MO_Add _rep) [x, CmmLit lit]) | is32BitLit lit -- ASSERT(rep == II32)??? 
= do (x_reg, x_code) <- getSomeReg x - let off = ImmInt (fromInteger i) + let off = litToImm lit return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code) -- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be @@ -1072,12 +991,12 @@ getAmode (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _), b@(CmmLit _)]) = getAmode (CmmMachOp (MO_Add rep) [b,a]) -getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _) +getAmode (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)]]) | shift == 0 || shift == 1 || shift == 2 || shift == 3 = x86_complex_amode x y shift 0 -getAmode (CmmMachOp (MO_Add rep) +getAmode (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Add _) [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)], CmmLit (CmmInt offset _)]]) @@ -1085,7 +1004,7 @@ getAmode (CmmMachOp (MO_Add rep) && is32BitInteger offset = x86_complex_amode x y shift offset -getAmode (CmmMachOp (MO_Add rep) [x,y]) +getAmode (CmmMachOp (MO_Add _) [x,y]) = x86_complex_amode x y 0 0 getAmode (CmmLit lit) | is32BitLit lit @@ -1104,7 +1023,8 @@ x86_complex_amode base index shift offset (y_reg, y_code) <- getSomeReg index let code = x_code `appOL` y_code - base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8 + base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8; + n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")" return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset))) code) @@ -1120,58 +1040,81 @@ x86_complex_amode base index shift offset -- (see trivialCode where this function is used for an example). getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) -#if x86_64_TARGET_ARCH -getNonClobberedOperand (CmmLit lit) - | isSuitableFloatingPointLit lit = do - lbl <- getNewLabelNat - let code = unitOL (LDATA ReadOnlyData [CmmDataLabel lbl, - CmmStaticLit lit]) - return (OpAddr (ripRel (ImmCLbl lbl)), code) -#endif -getNonClobberedOperand (CmmLit lit) - | is32BitLit lit && not (isFloatType (cmmLitType lit)) = - return (OpImm (litToImm lit), nilOL) -getNonClobberedOperand (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - (src',save_code) <- - if (amodeCouldBeClobbered src) - then do - tmp <- getNewRegNat archWordSize - return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0), - unitOL (LEA II32 (OpAddr src) (OpReg tmp))) - else - return (src, nilOL) - return (OpAddr src', save_code `appOL` mem_code) -getNonClobberedOperand e = do +getNonClobberedOperand (CmmLit lit) = do + use_sse2 <- sse2Enabled + if use_sse2 && isSuitableFloatingPointLit lit + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (widthInBytes w) lit + return (OpAddr addr, code) + else do + + if is32BitLit lit && not (isFloatType (cmmLitType lit)) + then return (OpImm (litToImm lit), nilOL) + else getNonClobberedOperand_generic (CmmLit lit) + +getNonClobberedOperand (CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) + && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + (src',save_code) <- + if (amodeCouldBeClobbered src) + then do + tmp <- getNewRegNat archWordSize + return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0), + unitOL (LEA II32 (OpAddr src) (OpReg tmp))) + else + return (src, nilOL) + return (OpAddr src', save_code `appOL` mem_code) + else do + getNonClobberedOperand_generic (CmmLoad mem pk) + +getNonClobberedOperand e = getNonClobberedOperand_generic e + 
+getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand_generic e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) amodeCouldBeClobbered :: AddrMode -> Bool amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode) -regClobbered (RealReg rr) = isFastTrue (freeReg rr) +regClobbered :: Reg -> Bool +regClobbered (RegReal (RealRegSingle rr)) = isFastTrue (freeReg rr) regClobbered _ = False -- getOperand: the operand is not required to remain valid across the -- computation of an arbitrary expression. getOperand :: CmmExpr -> NatM (Operand, InstrBlock) -#if x86_64_TARGET_ARCH -getOperand (CmmLit lit) - | isSuitableFloatingPointLit lit = do - lbl <- getNewLabelNat - let code = unitOL (LDATA ReadOnlyData [CmmDataLabel lbl, - CmmStaticLit lit]) - return (OpAddr (ripRel (ImmCLbl lbl)), code) -#endif -getOperand (CmmLit lit) - | is32BitLit lit && not (isFloatType (cmmLitType lit)) = do - return (OpImm (litToImm lit), nilOL) -getOperand (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - return (OpAddr src, mem_code) -getOperand e = do + +getOperand (CmmLit lit) = do + use_sse2 <- sse2Enabled + if (use_sse2 && isSuitableFloatingPointLit lit) + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (widthInBytes w) lit + return (OpAddr addr, code) + else do + + if is32BitLit lit && not (isFloatType (cmmLitType lit)) + then return (OpImm (litToImm lit), nilOL) + else getOperand_generic (CmmLit lit) + +getOperand (CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else + getOperand_generic (CmmLoad mem pk) + +getOperand e = getOperand_generic e + +getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock) +getOperand_generic e = do (reg, code) <- getSomeReg e return (OpReg reg, code) @@ -1181,28 +1124,67 @@ isOperand (CmmLit lit) = is32BitLit lit || isSuitableFloatingPointLit lit isOperand _ = False +memConstant :: Int -> CmmLit -> NatM Amode +memConstant align lit = do +#ifdef x86_64_TARGET_ARCH + lbl <- getNewLabelNat + let addr = ripRel (ImmCLbl lbl) + addr_code = nilOL +#else + lbl <- getNewLabelNat + dflags <- getDynFlagsNat + dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl + Amode addr addr_code <- getAmode dynRef +#endif + let code = + LDATA ReadOnlyData + [CmmAlign align, + CmmDataLabel lbl, + CmmStaticLit lit] + `consOL` addr_code + return (Amode addr code) + + +loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register +loadFloatAmode use_sse2 w addr addr_code = do + let size = floatSize w + code dst = addr_code `snocOL` + if use_sse2 + then MOV size (OpAddr addr) (OpReg dst) + else GLD size addr dst + -- in + return (Any (if use_sse2 then size else FF80) code) + + -- if we want a floating-point literal as an operand, we can -- use it directly from memory. However, if the literal is -- zero, we're better off generating it into a register using -- xor. 
+isSuitableFloatingPointLit :: CmmLit -> Bool isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0 isSuitableFloatingPointLit _ = False getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock) -getRegOrMem (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - return (OpAddr src, mem_code) +getRegOrMem e@(CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) getRegOrMem e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) +is32BitLit :: CmmLit -> Bool #if x86_64_TARGET_ARCH is32BitLit (CmmInt i W64) = is32BitInteger i -- assume that labels are in the range 0-2^31-1: this assumes the -- small memory model (see gcc docs, -mcmodel=small). #endif -is32BitLit x = True +is32BitLit _ = True @@ -1230,20 +1212,20 @@ getCondCode (CmmMachOp mop [x, y]) MO_F_Lt W64 -> condFltCode LTT x y MO_F_Le W64 -> condFltCode LE x y - MO_Eq rep -> condIntCode EQQ x y - MO_Ne rep -> condIntCode NE x y + MO_Eq _ -> condIntCode EQQ x y + MO_Ne _ -> condIntCode NE x y - MO_S_Gt rep -> condIntCode GTT x y - MO_S_Ge rep -> condIntCode GE x y - MO_S_Lt rep -> condIntCode LTT x y - MO_S_Le rep -> condIntCode LE x y + MO_S_Gt _ -> condIntCode GTT x y + MO_S_Ge _ -> condIntCode GE x y + MO_S_Lt _ -> condIntCode LTT x y + MO_S_Le _ -> condIntCode LE x y - MO_U_Gt rep -> condIntCode GU x y - MO_U_Ge rep -> condIntCode GEU x y - MO_U_Lt rep -> condIntCode LU x y - MO_U_Le rep -> condIntCode LEU x y + MO_U_Gt _ -> condIntCode GU x y + MO_U_Ge _ -> condIntCode GEU x y + MO_U_Lt _ -> condIntCode LU x y + MO_U_Le _ -> condIntCode LEU x y - other -> pprPanic "getCondCode(x86,x86_64,sparc)" (ppr (CmmMachOp mop [x,y])) + _other -> pprPanic "getCondCode(x86,x86_64,sparc)" (ppr (CmmMachOp mop [x,y])) getCondCode other = pprPanic "getCondCode(2)(x86,sparc)" (ppr other) @@ -1267,8 +1249,8 @@ condIntCode cond (CmmLoad x pk) (CmmLit lit) | is32BitLit lit = do -- anything vs zero, using a mask -- TODO: Add some sanity checking!!!! -condIntCode cond (CmmMachOp (MO_And rep) [x,o2]) (CmmLit (CmmInt 0 pk)) - | (CmmLit lit@(CmmInt mask pk2)) <- o2, is32BitLit lit +condIntCode cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk)) + | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit lit = do (x_reg, x_code) <- getSomeReg x let @@ -1312,40 +1294,35 @@ condIntCode cond x y = do -------------------------------------------------------------------------------- condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode -#if i386_TARGET_ARCH condFltCode cond x y - = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do - (x_reg, x_code) <- getNonClobberedReg x - (y_reg, y_code) <- getSomeReg y - let - code = x_code `appOL` y_code `snocOL` - GCMP cond x_reg y_reg - -- The GCMP insn does the test and sets the zero flag if comparable - -- and true. Hence we always supply EQQ as the condition to test. - return (CondCode True EQQ code) - -#elif x86_64_TARGET_ARCH --- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be --- an operand, but the right must be a reg. We can probably do better --- than this general case... 
-condFltCode cond x y = do - (x_reg, x_code) <- getNonClobberedReg x - (y_op, y_code) <- getOperand y - let - code = x_code `appOL` - y_code `snocOL` - CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg) - -- NB(1): we need to use the unsigned comparison operators on the - -- result of this comparison. - -- in - return (CondCode True (condToUnsigned cond) code) - -#else -condFltCode = panic "X86.condFltCode: not defined" - -#endif - + = if_sse2 condFltCode_sse2 condFltCode_x87 + where + condFltCode_x87 + = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do + (x_reg, x_code) <- getNonClobberedReg x + (y_reg, y_code) <- getSomeReg y + let + code = x_code `appOL` y_code `snocOL` + GCMP cond x_reg y_reg + -- The GCMP insn does the test and sets the zero flag if comparable + -- and true. Hence we always supply EQQ as the condition to test. + return (CondCode True EQQ code) + + -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be + -- an operand, but the right must be a reg. We can probably do better + -- than this general case... + condFltCode_sse2 = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` + y_code `snocOL` + CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg) + -- NB(1): we need to use the unsigned comparison operators on the + -- result of this comparison. + -- in + return (CondCode True (condToUnsigned cond) code) -- ----------------------------------------------------------------------------- -- Generating assignments @@ -1411,34 +1388,36 @@ assignMem_IntCode pk addr src = do -- Assign; dst is a reg, rhs is mem assignReg_IntCode pk reg (CmmLoad src _) = do load_code <- intLoadCode (MOV pk) src - return (load_code (getRegisterReg reg)) + return (load_code (getRegisterReg False{-no sse2-} reg)) -- dst is a reg, but src could be anything -assignReg_IntCode pk reg src = do +assignReg_IntCode _ reg src = do code <- getAnyReg src - return (code (getRegisterReg reg)) + return (code (getRegisterReg False{-no sse2-} reg)) -- Floating point assignment to memory assignMem_FltCode pk addr src = do (src_reg, src_code) <- getNonClobberedReg src Amode addr addr_code <- getAmode addr + use_sse2 <- sse2Enabled let code = src_code `appOL` addr_code `snocOL` - IF_ARCH_i386(GST pk src_reg addr, - MOV pk (OpReg src_reg) (OpAddr addr)) + if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr) + else GST pk src_reg addr return code -- Floating point assignment to a register/temporary -assignReg_FltCode pk reg src = do +assignReg_FltCode _ reg src = do + use_sse2 <- sse2Enabled src_code <- getAnyReg src - return (src_code (getRegisterReg reg)) + return (src_code (getRegisterReg use_sse2 reg)) genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock -genJump (CmmLoad mem pk) = do +genJump (CmmLoad mem _) = do Amode target code <- getAmode mem return (code `snocOL` JMP (OpAddr target)) @@ -1475,15 +1454,10 @@ genCondJump -> CmmExpr -- the condition on which to branch -> NatM InstrBlock -#if i386_TARGET_ARCH -genCondJump id bool = do - CondCode _ cond code <- getCondCode bool - return (code `snocOL` JXX cond id) - -#elif x86_64_TARGET_ARCH genCondJump id bool = do CondCode is_float cond cond_code <- getCondCode bool - if not is_float + use_sse2 <- sse2Enabled + if not is_float || not use_sse2 then return (cond_code `snocOL` JXX cond id) else do @@ -1511,13 +1485,6 @@ genCondJump id bool = do ] return (cond_code `appOL` code) -#else -genCondJump = panic "X86.genCondJump: not defined" - -#endif - - - -- 
----------------------------------------------------------------------------- -- Generating C calls @@ -1543,11 +1510,19 @@ genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL -- write barrier compiles to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. +-- void return type prim op +genCCall (CmmPrim op) [] args = + outOfLineCmmOp op Nothing args + -- we only cope with a single result for foreign calls -genCCall (CmmPrim op) [CmmHinted r _] args = do +genCCall (CmmPrim op) [r_hinted@(CmmHinted r _)] args = do l1 <- getNewLabelNat l2 <- getNewLabelNat - case op of + sse2 <- sse2Enabled + if sse2 + then + outOfLineCmmOp op (Just r_hinted) args + else case op of MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args @@ -1560,12 +1535,17 @@ genCCall (CmmPrim op) [CmmHinted r _] args = do MO_F32_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args MO_F64_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args - other_op -> outOfLineFloatOp op r args + _other_op -> outOfLineCmmOp op (Just r_hinted) args + where actuallyInlineFloatOp instr size [CmmHinted x _] - = do res <- trivialUFCode size (instr size) x + = do res <- trivialUFCode size (instr size) x any <- anyReg res - return (any (getRegisterReg (CmmLocal r))) + return (any (getRegisterReg False (CmmLocal r))) + + actuallyInlineFloatOp _ _ args + = panic $ "genCCall.actuallyInlineFloatOp: bad number of arguments! (" + ++ show (length args) ++ ")" genCCall target dest_regs args = do let @@ -1580,22 +1560,25 @@ genCCall target dest_regs args = do setDeltaNat (delta0 - arg_pad_size) #endif - push_codes <- mapM push_arg (reverse args) + use_sse2 <- sse2Enabled + push_codes <- mapM (push_arg use_sse2) (reverse args) delta <- getDeltaNat -- in -- deal with static vs dynamic call targets (callinsns,cconv) <- case target of - -- CmmPrim -> ... CmmCallee (CmmLit (CmmLabel lbl)) conv -> -- ToDo: stdcall arg sizes return (unitOL (CALL (Left fn_imm) []), conv) where fn_imm = ImmCLbl lbl CmmCallee expr conv - -> do { (dyn_c, dyn_r) <- get_op expr + -> do { (dyn_r, dyn_c) <- getSomeReg expr ; ASSERT( isWord32 (cmmExprType expr) ) return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) } + CmmPrim _ + -> panic $ "genCCall: Can't handle CmmPrim call type here, error " + ++ "probably because too many return values." 
let push_code #if darwin_TARGET_OS @@ -1606,12 +1589,24 @@ genCCall target dest_regs args = do | otherwise #endif = concatOL push_codes + + -- Deallocate parameters after call for ccall; + -- but not for stdcall (callee does it) + -- + -- We have to pop any stack padding we added + -- on Darwin even if we are doing stdcall, though (#5052) + pop_size | cconv /= StdCallConv = tot_arg_size + | otherwise +#if darwin_TARGET_OS + = arg_pad_size +#else + = 0 +#endif + call = callinsns `appOL` toOL ( - -- Deallocate parameters after call for ccall; - -- but not for stdcall (callee does it) - (if cconv == StdCallConv || tot_arg_size==0 then [] else - [ADD II32 (OpImm (ImmInt tot_arg_size)) (OpReg esp)]) + (if pop_size==0 then [] else + [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)]) ++ [DELTA (delta + tot_arg_size)] ) @@ -1622,16 +1617,27 @@ genCCall target dest_regs args = do -- assign the results, if necessary assign_code [] = nilOL assign_code [CmmHinted dest _hint] - | isFloatType ty = unitOL (GMOV fake0 r_dest) + | isFloatType ty = + if use_sse2 + then let tmp_amode = AddrBaseIndex (EABaseReg esp) + EAIndexNone + (ImmInt 0) + sz = floatSize w + in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp), + GST sz fake0 tmp_amode, + MOV sz (OpAddr tmp_amode) (OpReg r_dest), + ADD II32 (OpImm (ImmInt b)) (OpReg esp)] + else unitOL (GMOV fake0 r_dest) | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest), MOV II32 (OpReg edx) (OpReg r_dest_hi)] | otherwise = unitOL (MOV (intSize w) (OpReg eax) (OpReg r_dest)) where ty = localRegType dest w = typeWidth ty + b = widthInBytes w r_dest_hi = getHiVRegFromLo r_dest - r_dest = getRegisterReg (CmmLocal dest) - assign_code many = panic "genCCall.assign_code many" + r_dest = getRegisterReg use_sse2 (CmmLocal dest) + assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many) return (push_code `appOL` call `appOL` @@ -1641,14 +1647,15 @@ genCCall target dest_regs args = do arg_size :: CmmType -> Int -- Width in bytes arg_size ty = widthInBytes (typeWidth ty) +#if darwin_TARGET_OS roundTo a x | x `mod` a == 0 = x | otherwise = x + a - (x `mod` a) +#endif - - push_arg :: HintedCmmActual {-current argument-} + push_arg :: Bool -> HintedCmmActual {-current argument-} -> NatM InstrBlock -- code - push_arg (CmmHinted arg _hint) -- we don't need the hints on x86 + push_arg use_sse2 (CmmHinted arg _hint) -- we don't need the hints on x86 | isWord64 arg_ty = do ChildCode64 code r_lo <- iselExpr64 arg delta <- getDeltaNat @@ -1662,32 +1669,35 @@ genCCall target dest_regs args = do DELTA (delta-8)] ) - | otherwise = do - (code, reg) <- get_op arg + | isFloatType arg_ty = do + (reg, code) <- getSomeReg arg delta <- getDeltaNat - let size = arg_size arg_ty -- Byte size setDeltaNat (delta-size) - if (isFloatType arg_ty) - then return (code `appOL` + return (code `appOL` toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp), DELTA (delta-size), - GST (floatSize (typeWidth arg_ty)) - reg (AddrBaseIndex (EABaseReg esp) + let addr = AddrBaseIndex (EABaseReg esp) EAIndexNone - (ImmInt 0))] - ) - else return (code `snocOL` - PUSH II32 (OpReg reg) `snocOL` - DELTA (delta-size) + (ImmInt 0) + size = floatSize (typeWidth arg_ty) + in + if use_sse2 + then MOV size (OpReg reg) (OpAddr addr) + else GST size reg addr + ] ) + + | otherwise = do + (operand, code) <- getOperand arg + delta <- getDeltaNat + setDeltaNat (delta-size) + return (code `snocOL` + PUSH II32 operand `snocOL` + DELTA (delta-size)) + where arg_ty = cmmExprType arg - - ------------ - get_op :: 
CmmExpr -> NatM (InstrBlock, Reg) -- code, reg - get_op op = do - (reg,code) <- getSomeReg op - return (code, reg) + size = arg_size arg_ty -- Byte size #elif x86_64_TARGET_ARCH @@ -1695,9 +1705,13 @@ genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL -- write barrier compiles to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. +-- void return type prim op +genCCall (CmmPrim op) [] args = + outOfLineCmmOp op Nothing args -genCCall (CmmPrim op) [CmmHinted r _] args = - outOfLineFloatOp op r args +-- we only cope with a single result for foreign calls +genCCall (CmmPrim op) [res] args = + outOfLineCmmOp op (Just res) args genCCall target dest_regs args = do @@ -1741,7 +1755,6 @@ genCCall target dest_regs args = do -- deal with static vs dynamic call targets (callinsns,cconv) <- case target of - -- CmmPrim -> ... CmmCallee (CmmLit (CmmLabel lbl)) conv -> -- ToDo: stdcall arg sizes return (unitOL (CALL (Left fn_imm) arg_regs), conv) @@ -1749,15 +1762,18 @@ genCCall target dest_regs args = do CmmCallee expr conv -> do (dyn_r, dyn_c) <- getSomeReg expr return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv) + CmmPrim _ + -> panic $ "genCCall: Can't handle CmmPrim call type here, error " + ++ "probably because too many return values." let - -- The x86_64 ABI requires us to set %al to the number of SSE + -- The x86_64 ABI requires us to set %al to the number of SSE2 -- registers that contain arguments, if the called routine -- is a varargs function. We don't know whether it's a -- varargs function or not, so we have to assume it is. -- -- It's not safe to omit this assignment, even if the number - -- of SSE regs in use is zero. If %al is larger than 8 + -- of SSE2 regs in use is zero. If %al is larger than 8 -- on entry to a varargs function, seg faults ensue. 
 	assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
 
@@ -1779,12 +1795,12 @@ genCCall target dest_regs args = do
     assign_code [CmmHinted dest _hint] = 
 	case typeWidth rep of
 	      W32 | isFloatType rep -> unitOL (MOV (floatSize W32) (OpReg xmm0) (OpReg r_dest))
-	      W64 | isFloatType rep -> unitOL (MOV (floatSize W32) (OpReg xmm0) (OpReg r_dest))
+	      W64 | isFloatType rep -> unitOL (MOV (floatSize W64) (OpReg xmm0) (OpReg r_dest))
	      _ -> unitOL (MOV (cmmTypeSize rep) (OpReg rax) (OpReg r_dest))
 	where 
 	      rep = localRegType dest
-	      r_dest = getRegisterReg (CmmLocal dest)
+	      r_dest = getRegisterReg True (CmmLocal dest)
-    assign_code many = panic "genCCall.assign_code many"
+    assign_code _many = panic "genCCall.assign_code many"
 
     return (load_args_code 	`appOL` 
 	    adjust_rsp 		`appOL`
@@ -1826,7 +1842,7 @@ genCCall target dest_regs args = do
 	     return ((CmmHinted arg hint):args', ars, frs, code')
 
    push_args [] code = return code
-    push_args ((CmmHinted arg hint):rest) code
+    push_args ((CmmHinted arg _):rest) code
       | isFloatType arg_rep = do
 	 (arg_reg, arg_code) <- getSomeReg arg
         delta <- getDeltaNat
@@ -1859,28 +1875,25 @@ genCCall = panic "X86.genCCAll: not defined"
 
 #endif /* x86_64_TARGET_ARCH */
 
-
-
-outOfLineFloatOp :: CallishMachOp -> CmmFormal -> HintedCmmActuals -> NatM InstrBlock
-outOfLineFloatOp mop res args
+outOfLineCmmOp :: CallishMachOp -> Maybe HintedCmmFormal -> HintedCmmActuals -> NatM InstrBlock
+outOfLineCmmOp mop res args
   = do
      dflags <- getDynFlagsNat
     targetExpr <- cmmMakeDynamicReference dflags addImportNat CallReference lbl
     let target = CmmCallee targetExpr CCallConv
-     
-     if isFloat64 (localRegType res)
-       then
-         stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn)
-       else do
-         uq <- getUniqueNat
-         let 
-           tmp = LocalReg uq f64
-         -- in
-         code1 <- stmtToInstrs (CmmCall target [CmmHinted tmp NoHint] args CmmUnsafe CmmMayReturn)
-         code2 <- stmtToInstrs (CmmAssign (CmmLocal res) (CmmReg (CmmLocal tmp)))
-         return (code1 `appOL` code2)
+
+     stmtToInstrs (CmmCall target (catMaybes [res]) args' CmmUnsafe CmmMayReturn)
  where
-	lbl = mkForeignLabel fn Nothing False IsFunction
+      -- Assume we can call these functions directly, and that they're not in a dynamic library.
+      -- TODO: Why is this ok? Under Linux this code will be in libm.so
+      --       Is it because they're really implemented as primitive instructions by the assembler??  -- BL 2009/12/31 
+      lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
+
+      args' = case mop of
+                  MO_Memcpy    -> init args
+                  MO_Memset    -> init args
+                  MO_Memmove   -> init args
+                  _            -> args
 
 	fn = case mop of
 	      MO_F32_Sqrt  -> fsLit "sqrtf"
@@ -1915,8 +1928,11 @@ outOfLineFloatOp mop res args
 	      MO_F64_Tanh  -> fsLit "tanh"
 	      MO_F64_Pwr   -> fsLit "pow"
 
+	      MO_Memcpy    -> fsLit "memcpy"
+	      MO_Memset    -> fsLit "memset"
+	      MO_Memmove   -> fsLit "memmove"
 
-
+	      other -> panic $ "outOfLineCmmOp: unmatched op! 
(" ++ show other ++ ")" -- ----------------------------------------------------------------------------- @@ -1932,16 +1948,7 @@ genSwitch expr ids dflags <- getDynFlagsNat dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl (tableReg,t_code) <- getSomeReg $ dynRef - let - jumpTable = map jumpTableEntryRel ids - - jumpTableEntryRel Nothing - = CmmStaticLit (CmmInt 0 wordWidth) - jumpTableEntryRel (Just (BlockId id)) - = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0) - where blockLabel = mkAsmTempLabel id - - op = OpAddr (AddrBaseIndex (EABaseReg tableReg) + let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) (EAIndex reg wORD_SIZE) (ImmInt 0)) #if x86_64_TARGET_ARCH @@ -1954,8 +1961,7 @@ genSwitch expr ids code = e_code `appOL` t_code `appOL` toOL [ ADD (intSize wordWidth) op (OpReg tableReg), - JMP_TBL (OpReg tableReg) [ id | Just id <- ids ], - LDATA Text (CmmDataLabel lbl : jumpTable) + JMP_TBL (OpReg tableReg) ids Text lbl ] #else -- HACK: On x86_64 binutils<2.17 is only able to generate PC32 @@ -1965,20 +1971,15 @@ genSwitch expr ids -- conjunction with the hack in PprMach.hs/pprDataItem once -- binutils 2.17 is standard. code = e_code `appOL` t_code `appOL` toOL [ - LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable), - MOVSxL II32 - (OpAddr (AddrBaseIndex (EABaseReg tableReg) - (EAIndex reg wORD_SIZE) (ImmInt 0))) - (OpReg reg), + MOVSxL II32 op (OpReg reg), ADD (intSize wordWidth) (OpReg reg) (OpReg tableReg), - JMP_TBL (OpReg tableReg) [ id | Just id <- ids ] + JMP_TBL (OpReg tableReg) ids ReadOnlyData lbl ] #endif #else code = e_code `appOL` t_code `appOL` toOL [ - LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable), ADD (intSize wordWidth) op (OpReg tableReg), - JMP_TBL (OpReg tableReg) [ id | Just id <- ids ] + JMP_TBL (OpReg tableReg) ids ReadOnlyData lbl ] #endif return code @@ -1986,16 +1987,29 @@ genSwitch expr ids = do (reg,e_code) <- getSomeReg expr lbl <- getNewLabelNat - let - jumpTable = map jumpTableEntry ids - op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl)) + let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl)) code = e_code `appOL` toOL [ - LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable), - JMP_TBL op [ id | Just id <- ids ] + JMP_TBL op ids ReadOnlyData lbl ] -- in return code +generateJumpTableForInstr :: Instr -> Maybe (NatCmmTop Instr) +generateJumpTableForInstr (JMP_TBL _ ids section lbl) = Just (createJumpTable ids section lbl) +generateJumpTableForInstr _ = Nothing + +createJumpTable :: [Maybe BlockId] -> Section -> CLabel -> GenCmmTop CmmStatic h g +createJumpTable ids section lbl + = let jumpTable + | opt_PIC = + let jumpTableEntryRel Nothing + = CmmStaticLit (CmmInt 0 wordWidth) + jumpTableEntryRel (Just blockid) + = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0) + where blockLabel = mkAsmTempLabel (getUnique blockid) + in map jumpTableEntryRel ids + | otherwise = map jumpTableEntry ids + in CmmData section (CmmDataLabel lbl : jumpTable) -- ----------------------------------------------------------------------------- -- 'condIntReg' and 'condFltReg': condition codes into registers @@ -2022,72 +2036,64 @@ condIntReg cond x y = do condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register - -#if i386_TARGET_ARCH -condFltReg cond x y = do - CondCode _ cond cond_code <- condFltCode cond x y - tmp <- getNewRegNat II8 - let - code dst = cond_code `appOL` toOL [ - SETCC cond (OpReg tmp), - MOVZxL II8 (OpReg tmp) (OpReg dst) - ] - -- in - return (Any II32 code) - -#elif x86_64_TARGET_ARCH 
-condFltReg cond x y = do - CondCode _ cond cond_code <- condFltCode cond x y - tmp1 <- getNewRegNat wordSize - tmp2 <- getNewRegNat wordSize - let - -- We have to worry about unordered operands (eg. comparisons - -- against NaN). If the operands are unordered, the comparison - -- sets the parity flag, carry flag and zero flag. - -- All comparisons are supposed to return false for unordered - -- operands except for !=, which returns true. - -- - -- Optimisation: we don't have to test the parity flag if we - -- know the test has already excluded the unordered case: eg > - -- and >= test for a zero carry flag, which can only occur for - -- ordered operands. - -- - -- ToDo: by reversing comparisons we could avoid testing the - -- parity flag in more cases. - - code dst = - cond_code `appOL` - (case cond of - NE -> or_unordered dst - GU -> plain_test dst - GEU -> plain_test dst - _ -> and_ordered dst) - - plain_test dst = toOL [ - SETCC cond (OpReg tmp1), - MOVZxL II8 (OpReg tmp1) (OpReg dst) - ] - or_unordered dst = toOL [ - SETCC cond (OpReg tmp1), - SETCC PARITY (OpReg tmp2), - OR II8 (OpReg tmp1) (OpReg tmp2), - MOVZxL II8 (OpReg tmp2) (OpReg dst) - ] - and_ordered dst = toOL [ - SETCC cond (OpReg tmp1), - SETCC NOTPARITY (OpReg tmp2), - AND II8 (OpReg tmp1) (OpReg tmp2), - MOVZxL II8 (OpReg tmp2) (OpReg dst) - ] - -- in - return (Any II32 code) - -#else -condFltReg = panic "X86.condFltReg: not defined" - -#endif - - +condFltReg cond x y = if_sse2 condFltReg_sse2 condFltReg_x87 + where + condFltReg_x87 = do + CondCode _ cond cond_code <- condFltCode cond x y + tmp <- getNewRegNat II8 + let + code dst = cond_code `appOL` toOL [ + SETCC cond (OpReg tmp), + MOVZxL II8 (OpReg tmp) (OpReg dst) + ] + -- in + return (Any II32 code) + + condFltReg_sse2 = do + CondCode _ cond cond_code <- condFltCode cond x y + tmp1 <- getNewRegNat archWordSize + tmp2 <- getNewRegNat archWordSize + let + -- We have to worry about unordered operands (eg. comparisons + -- against NaN). If the operands are unordered, the comparison + -- sets the parity flag, carry flag and zero flag. + -- All comparisons are supposed to return false for unordered + -- operands except for !=, which returns true. + -- + -- Optimisation: we don't have to test the parity flag if we + -- know the test has already excluded the unordered case: eg > + -- and >= test for a zero carry flag, which can only occur for + -- ordered operands. + -- + -- ToDo: by reversing comparisons we could avoid testing the + -- parity flag in more cases. + + code dst = + cond_code `appOL` + (case cond of + NE -> or_unordered dst + GU -> plain_test dst + GEU -> plain_test dst + _ -> and_ordered dst) + + plain_test dst = toOL [ + SETCC cond (OpReg tmp1), + MOVZxL II8 (OpReg tmp1) (OpReg dst) + ] + or_unordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC PARITY (OpReg tmp2), + OR II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + and_ordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC NOTPARITY (OpReg tmp2), + AND II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + -- in + return (Any II32 code) -- ----------------------------------------------------------------------------- @@ -2148,7 +2154,10 @@ SDM's version of The Rules: register happens to be the destination register. 
-}
 
-trivialCode width instr (Just revinstr) (CmmLit lit_a) b
+trivialCode :: Width -> (Operand -> Operand -> Instr)
+            -> Maybe (Operand -> Operand -> Instr)
+            -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode width _ (Just revinstr) (CmmLit lit_a) b
   | is32BitLit lit_a = do
   b_code <- getAnyReg b
   let
@@ -2158,10 +2167,12 @@ trivialCode width instr (Just revinstr) (CmmLit lit_a) b
   -- in
   return (Any (intSize width) code)
 
-trivialCode width instr maybe_revinstr a b
+trivialCode width instr _ a b
   = genTrivialCode (intSize width) instr a b
 
 -- This is re-used for floating pt instructions too.
+genTrivialCode :: Size -> (Operand -> Operand -> Instr)
+               -> CmmExpr -> CmmExpr -> NatM Register
 genTrivialCode rep instr a b = do
   (b_op, b_code) <- getNonClobberedOperand b
   a_code <- getAnyReg a
@@ -2186,12 +2197,15 @@ genTrivialCode rep instr a b = do
   -- in
   return (Any rep code)
 
+regClashesWithOp :: Reg -> Operand -> Bool
 reg `regClashesWithOp` OpReg reg2   = reg == reg2
 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
-reg `regClashesWithOp` _            = False
+_   `regClashesWithOp` _            = False
 
 -----------
 
+trivialUCode :: Size -> (Operand -> Instr)
+             -> CmmExpr -> NatM Register
 trivialUCode rep instr x = do
  x_code <- getAnyReg x
  let
@@ -2202,27 +2216,27 @@ trivialUCode rep instr x = do
 
 -----------
 
-#if i386_TARGET_ARCH
-
-trivialFCode width instr x y = do
+trivialFCode_x87 :: (Size -> Reg -> Reg -> Reg -> Instr)
+                 -> CmmExpr -> CmmExpr -> NatM Register
+trivialFCode_x87 instr x y = do
  (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
  (y_reg, y_code) <- getSomeReg y
  let
-     size = floatSize width
+     size = FF80 -- always, on x87
      code dst =
	x_code `appOL`
	y_code `snocOL`
	instr size x_reg y_reg dst
  return (Any size code)
 
-#endif
+trivialFCode_sse2 :: Width -> (Size -> Operand -> Operand -> Instr)
+                  -> CmmExpr -> CmmExpr -> NatM Register
+trivialFCode_sse2 pk instr x y
+    = genTrivialCode size (instr size) x y
+    where size = floatSize pk
 
-#if x86_64_TARGET_ARCH
-trivialFCode pk instr x y 
-  = genTrivialCode size (instr size) x y
-  where size = floatSize pk
-#endif
 
+trivialUFCode :: Size -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
 trivialUFCode size instr x = do
  (x_reg, x_code) <- getSomeReg x
  let
@@ -2235,79 +2249,86 @@ trivialUFCode size instr x = do
 
--------------------------------------------------------------------------------- 
 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
-
-#if i386_TARGET_ARCH
-coerceInt2FP from to x = do
-	(x_reg, x_code) <- getSomeReg x
-	let
-	      opc  = case to of W32 -> GITOF; W64 -> GITOD
-	      code dst = x_code `snocOL` opc x_reg dst
-	-- ToDo: works for non-II32 reps?
-	return (Any (floatSize to) code)
-
-#elif x86_64_TARGET_ARCH
-coerceInt2FP from to x = do
-    (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
-    let
-        opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
-        code dst = x_code `snocOL` opc x_op dst
-    -- in
-    return (Any (floatSize to) code) -- works even if the destination rep is <II32
-
-#else
-coerceInt2FP = panic "X86.coerceInt2FP: not defined"
-
-#endif
+coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
+ where
+   coerce_x87 = do
+        (x_reg, x_code) <- getSomeReg x
+        let
+             opc  = case to of W32 -> GITOF; W64 -> GITOD;
+                               n -> panic $ "coerceInt2FP.x87: unhandled width ("
+                                         ++ show n ++ ")"
+             code dst = x_code `snocOL` opc x_reg dst
+        -- ToDo: works for non-II32 reps? 
+        return (Any FF80 code)
+
+   coerce_sse2 = do
+        (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
+        let
+             opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
+                               n -> panic $ "coerceInt2FP.sse: unhandled width ("
+                                         ++ show n ++ ")"
+             code dst = x_code `snocOL` opc (intSize from) x_op dst
+        -- in
+        return (Any (floatSize to) code)
+        -- works even if the destination rep is <II32
 
 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
-
-#if i386_TARGET_ARCH
-coerceFP2Int from to x = do
-	(x_reg, x_code) <- getSomeReg x
-	let
-	      opc  = case from of W32 -> GFTOI; W64 -> GDTOI
-	      code dst = x_code `snocOL` opc x_reg dst
-	-- ToDo: works for non-II32 reps?
-	-- in
-	return (Any (intSize to) code)
-
-#elif x86_64_TARGET_ARCH
-coerceFP2Int from to x = do
-    (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
-    let
-        opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ
-        code dst = x_code `snocOL` opc x_op dst
-    -- in
-    return (Any (intSize to) code) -- works even if the destination rep is <II32
-
-#else
-coerceFP2Int = panic "X86.coerceFP2Int: not defined"
-
-#endif
+coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
+ where
+   coerceFP2Int_x87 = do
+        (x_reg, x_code) <- getSomeReg x
+        let
+            opc  = case from of W32 -> GFTOI; W64 -> GDTOI
+                            n -> panic $ "coerceFP2Int.x87: unhandled width ("
+                                      ++ show n ++ ")"
+        code dst = x_code `snocOL` opc x_reg dst
+        -- ToDo: works for non-II32 reps?
+        -- in
+        return (Any (intSize to) code)
+
+    coerceFP2Int_sse2 = do
+        (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+        let
+            opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
+                                n -> panic $ "coerceFP2Int.sse: unhandled width ("
+                                          ++ show n ++ ")"
+            code dst = x_code `snocOL` opc (intSize to) x_op dst
+        -- in
+        return (Any (intSize to) code)
+        -- works even if the destination rep is <II32
 
 coerceFP2FP :: Width -> CmmExpr -> NatM Register
-
-#if x86_64_TARGET_ARCH
 coerceFP2FP to x = do
+    use_sse2 <- sse2Enabled
    (x_reg, x_code) <- getSomeReg x
    let
-         opc  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD
+         opc | use_sse2  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
+                                      n -> panic $ "coerceFP2FP: unhandled width ("
+                                                ++ show n ++ ")"
+             | otherwise = GDTOF
         code dst = x_code `snocOL` opc x_reg dst
   -- in
-  return (Any (floatSize to) code)
-
-#else
-coerceFP2FP = panic "X86.coerceFP2FP: not defined"
-
-#endif
+  return (Any (if use_sse2 then floatSize to else FF80) code)
 
+--------------------------------------------------------------------------------
+sse2NegCode :: Width -> CmmExpr -> NatM Register
+sse2NegCode w x = do
+  let sz = floatSize w
+  x_code <- getAnyReg x
+  -- This is how gcc does it, so it can't be that bad:
+  let
+    const | FF32 <- sz = CmmInt 0x80000000 W32
+          | otherwise  = CmmInt 0x8000000000000000 W64
+  Amode amode amode_code <- memConstant (widthInBytes w) const
+  tmp <- getNewRegNat sz
+  let
+    code dst = x_code dst `appOL` amode_code `appOL` toOL [
+        MOV sz (OpAddr amode) (OpReg tmp),
+        XOR sz (OpReg tmp) (OpReg dst)
        ]
+  --
+  return (Any sz code)
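-- Editorial note (not part of the original patch): sse2NegCode negates a
-- float by flipping its IEEE-754 sign bit with an XOR against an in-memory
-- constant (0x80000000 for W32, 0x8000000000000000 for W64), the same
-- trick gcc emits.  A self-contained illustration of the bit manipulation
-- it relies on (hypothetical helper, for exposition only):
--
--     import Data.Bits (xor)
--     import Data.Word (Word64)
--
--     negateDoubleBits :: Word64 -> Word64  -- bit pattern of a Double
--     negateDoubleBits bits = bits `xor` 0x8000000000000000
--
-- For example, 1.0 has the pattern 0x3FF0000000000000; flipping the top
-- bit gives 0xBFF0000000000000, which is -1.0.  Unlike subtracting from
-- zero, this maps +0.0 to -0.0 and leaves NaN payloads untouched, and
-- since the xorps/xorpd forms want their constant operand in memory, the
-- code above routes it through memConstant.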