[project @ 2005-05-21 15:39:00 by panne]

[ghc-hetmet.git] / ghc / compiler / nativeGen / MachCodeGen.hs
diff --git a/ghc/compiler/nativeGen/MachCodeGen.hs b/ghc/compiler/nativeGen/MachCodeGen.hs

index 84d6d0d..24e8b04 100644 (file)
--- a/ghc/compiler/nativeGen/MachCodeGen.hs
+++ b/ghc/compiler/nativeGen/MachCodeGen.hs
@@ -35,7 +35,6 @@ import ForeignCall    ( CCallConv(..) )
  import OrdList
  import Pretty
  import Outputable
-import qualified Outputable
  import FastString
  import FastTypes       ( isFastTrue )
  import Constants       ( wORD_SIZE )
@@ -513,17 +512,17 @@ getRegisterReg (CmmGlobal mid)
  
  getRegister :: CmmExpr -> NatM Register
  
+getRegister (CmmReg (CmmGlobal PicBaseReg))
+  = do
+      reg <- getPicBaseNat wordRep
+      return (Fixed wordRep reg nilOL)
+
  getRegister (CmmReg reg) 
    = return (Fixed (cmmRegRep reg) (getRegisterReg reg) nilOL)
  
  getRegister tree@(CmmRegOff _ _) 
    = getRegister (mangleIndexTree tree)
  
-getRegister CmmPicBaseReg
-  = do
-      reg <- getPicBaseNat wordRep
-      return (Fixed wordRep reg nilOL)
-
  -- end of machine-"independent" bit; here we go on the rest...
  
  #if alpha_TARGET_ARCH
@@ -809,8 +808,7 @@ getRegister (CmmLit (CmmFloat f rep)) = do
             LDATA ReadOnlyData
                         [CmmDataLabel lbl,
                          CmmStaticLit (CmmFloat f rep)],
-           MOV rep (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst)
-       -- ToDo: should use %rip-relative
+           MOV rep (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
             ]
      -- in
      return (Any rep code)
@@ -869,9 +867,10 @@ getRegister (CmmMachOp (MO_S_Conv I32 I64) [CmmLoad addr _]) = do
  
  #if x86_64_TARGET_ARCH
  getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do
+  x_code <- getAnyReg x
    lbl <- getNewLabelNat
    let
-    code dst = toOL [
+    code dst = x_code dst `appOL` toOL [
         -- This is how gcc does it, so it can't be that bad:
         LDATA ReadOnlyData16 [
                 CmmAlign 16,
@@ -881,7 +880,7 @@ getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do
                 CmmStaticLit (CmmInt 0 I32),
                 CmmStaticLit (CmmInt 0 I32)
         ],
-       XOR F32 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst)
+       XOR F32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
                 -- xorps, so we need the 128-bit constant
                 -- ToDo: rip-relative
         ]
@@ -889,10 +888,11 @@ getRegister (CmmMachOp (MO_S_Neg F32) [x]) = do
    return (Any F32 code)
  
  getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do
+  x_code <- getAnyReg x
    lbl <- getNewLabelNat
    let
         -- This is how gcc does it, so it can't be that bad:
-    code dst = toOL [
+    code dst = x_code dst `appOL` toOL [
         LDATA ReadOnlyData16 [
                 CmmAlign 16,
                 CmmDataLabel lbl,
@@ -900,9 +900,8 @@ getRegister (CmmMachOp (MO_S_Neg F64) [x]) = do
                 CmmStaticLit (CmmInt 0 I64)
         ],
                 -- gcc puts an unpck here.  Wonder if we need it.
-       XOR F64 (OpAddr (ImmAddr (ImmCLbl lbl) 0)) (OpReg dst)
+       XOR F64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
                 -- xorpd, so we need the 128-bit constant
-               -- ToDo: rip-relative
         ]
    --
    return (Any F64 code)
@@ -1087,23 +1086,25 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
      --------------------
      imulMayOflo :: MachRep -> CmmExpr -> CmmExpr -> NatM Register
      imulMayOflo rep a b = do
-         res_lo <- getNewRegNat rep
-         res_hi <- getNewRegNat rep
           (a_reg, a_code) <- getNonClobberedReg a
-         (b_reg, b_code) <- getSomeReg   b
+         b_code <- getAnyReg b
           let 
-             code dst = a_code `appOL` b_code `appOL`
+            shift_amt  = case rep of
+                          I32 -> 31
+                          I64 -> 63
+                          _ -> panic "shift_amt"
+
+             code = a_code `appOL` b_code eax `appOL`
                          toOL [
-                           MOV rep (OpReg a_reg) (OpReg res_hi),
-                           MOV rep (OpReg b_reg) (OpReg res_lo),
-                           IMUL64 res_hi res_lo,               -- result in res_hi:res_lo
-                           SAR rep (OpImm (ImmInt 31)) (OpReg res_lo), -- sign extend lower part
-                           SUB rep (OpReg res_hi) (OpReg res_lo),      -- compare against upper
-                           MOV rep (OpReg res_lo) (OpReg dst)
-                           -- dst==0 if high part == sign extended low part
+                          IMUL2 rep (OpReg a_reg),   -- result in %edx:%eax
+                           SAR rep (OpImm (ImmInt shift_amt)) (OpReg eax),
+                               -- sign extend lower part
+                           SUB rep (OpReg edx) (OpReg eax)
+                               -- compare against upper
+                           -- eax==0 if high part == sign extended low part
                          ]
           -- in
-        return (Any rep code)
+        return (Fixed rep eax code)
  
      --------------------
      shift_code :: MachRep
@@ -1151,14 +1152,14 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
             code dst
                 = x_code `snocOL`
                  LEA rep
-                       (OpAddr (AddrBaseIndex (Just x_reg) Nothing imm))
+                       (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                          (OpReg dst)
         -- 
         return (Any rep code)
  
      ----------------------
      div_code rep signed quotient x y = do
-          (y_op, y_code) <- getOperand y -- cannot be clobbered
+          (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
            x_code <- getAnyReg x
            let
              widen | signed    = CLTD rep
@@ -1837,14 +1838,14 @@ getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)])
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (-(fromInteger i))
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
    
  getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit@(CmmInt i _)])
    | not (is64BitLit lit)
    -- ASSERT(rep == I32)???
    = do (x_reg, x_code) <- getSomeReg x
         let off = ImmInt (fromInteger i)
-       return (Amode (AddrBaseIndex (Just x_reg) Nothing off) x_code)
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
  
  -- Turn (lit1 << n  + lit2) into  (lit2 + lit1 << n) so it will be 
  -- recognised by the next rule.
@@ -1862,7 +1863,7 @@ getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _)
         let
            code = x_code `appOL` y_code
             base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
-       return (Amode (AddrBaseIndex (Just x_reg) (Just (y_reg,base)) (ImmInt 0))
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt 0))
                 code)
  
  getAmode (CmmLit lit) | not (is64BitLit lit)
@@ -1870,7 +1871,7 @@ getAmode (CmmLit lit) | not (is64BitLit lit)
  
  getAmode expr = do
    (reg,code) <- getSomeReg expr
-  return (Amode (AddrBaseIndex (Just reg) Nothing (ImmInt 0)) code)
+  return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
  
  #endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
@@ -1994,6 +1995,14 @@ getAmode other
  #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
+getNonClobberedOperand (CmmLit lit)
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
  getNonClobberedOperand (CmmLit lit)
    | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) =
      return (OpImm (litToImm lit), nilOL)
@@ -2004,7 +2013,7 @@ getNonClobberedOperand (CmmLoad mem pk)
         if (amodeCouldBeClobbered src) 
                 then do
                    tmp <- getNewRegNat wordRep
-                  return (AddrBaseIndex (Just tmp) Nothing (ImmInt 0),
+                  return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                            unitOL (LEA I32 (OpAddr src) (OpReg tmp)))
                 else
                    return (src, nilOL)
@@ -2022,23 +2031,38 @@ regClobbered _ = False
  -- getOperand: the operand is not required to remain valid across the
  -- computation of an arbitrary expression.
  getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+#if x86_64_TARGET_ARCH
  getOperand (CmmLit lit)
-  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) =
+  | isSuitableFloatingPointLit lit = do
+    lbl <- getNewLabelNat
+    let code = unitOL (LDATA ReadOnlyData  [CmmDataLabel lbl,
+                                          CmmStaticLit lit])
+    return (OpAddr (ripRel (ImmCLbl lbl)), code)
+#endif
+getOperand (CmmLit lit)
+  | not (is64BitLit lit) && not (isFloatingRep (cmmLitRep lit)) = do
      return (OpImm (litToImm lit), nilOL)
  getOperand (CmmLoad mem pk)
    | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
      Amode src mem_code <- getAmode mem
      return (OpAddr src, mem_code)
  getOperand e = do
-    (reg, code) <- getNonClobberedReg e
+    (reg, code) <- getSomeReg e
      return (OpReg reg, code)
  
  isOperand :: CmmExpr -> Bool
  isOperand (CmmLoad _ _) = True
-isOperand (CmmLit lit)  = not (is64BitLit lit) && 
-                         not (isFloatingRep (cmmLitRep lit))
+isOperand (CmmLit lit)  = not (is64BitLit lit)
+                         || isSuitableFloatingPointLit lit
  isOperand _             = False
  
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory.  However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
  getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
  getRegOrMem (CmmLoad mem pk)
    | IF_ARCH_i386(not (isFloatingRep pk) && pk /= I64, True) = do
@@ -2234,10 +2258,10 @@ condFltCode cond x y = do
         code = x_code `appOL`
                y_code `snocOL`
                   CMP (cmmExprRep x) y_op (OpReg x_reg)
-  -- in
-  return (CondCode False (condToUnsigned cond) code)
-       -- we need to use the unsigned comparison operators on the
+       -- NB(1): we need to use the unsigned comparison operators on the
         -- result of this comparison.
+  -- in
+  return (CondCode True (condToUnsigned cond) code)
  #endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2884,14 +2908,51 @@ genCondJump lbl (StPrim op [x, y])
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
-#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
+#if i386_TARGET_ARCH
  
  genCondJump id bool = do
    CondCode _ cond code <- getCondCode bool
    return (code `snocOL` JXX cond id)
  
-#endif /* i386_TARGET_ARCH */
+#endif
  
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if x86_64_TARGET_ARCH
+
+genCondJump id bool = do
+  CondCode is_float cond cond_code <- getCondCode bool
+  if not is_float
+    then
+       return (cond_code `snocOL` JXX cond id)
+    else do
+       lbl <- getBlockIdNat
+
+       -- see comment with condFltReg
+       let code = case cond of
+                       NE  -> or_unordered
+                       GU  -> plain_test
+                       GEU -> plain_test
+                       _   -> and_ordered
+
+           plain_test = unitOL (
+                 JXX cond id
+               )
+           or_unordered = toOL [
+                 JXX cond id,
+                 JXX PARITY id
+               ]
+           and_ordered = toOL [
+                 JXX PARITY lbl,
+                 JXX cond id,
+                 JXX ALWAYS lbl,
+                 NEWBLOCK lbl
+               ]
+       return (cond_code `appOL` code)
+
+#endif
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if sparc_TARGET_ARCH
  
@@ -3046,12 +3107,12 @@ genCCall target dest_regs args vols = do
         -- CmmPrim -> ...
          CmmForeignCall (CmmLit (CmmLabel lbl)) conv
             -> -- ToDo: stdcall arg sizes
-             return (unitOL (CALL (Left fn_imm)), conv)
+             return (unitOL (CALL (Left fn_imm) []), conv)
            where fn_imm = ImmCLbl lbl
          CmmForeignCall expr conv
             -> do (dyn_c, dyn_r, dyn_rep) <- get_op expr
                   ASSERT(dyn_rep == I32)
-                  return (dyn_c `snocOL` CALL (Right dyn_r), conv)
+                  return (dyn_c `snocOL` CALL (Right dyn_r) [], conv)
  
      let        push_code = concatOL push_codes
         call = callinsns `appOL`
@@ -3118,8 +3179,8 @@ genCCall target dest_regs args vols = do
                          code `appOL`
                          toOL [SUB I32 (OpImm (ImmInt size)) (OpReg esp),
                                DELTA (delta-size),
-                              GST sz reg (AddrBaseIndex (Just esp) 
-                                                        Nothing 
+                              GST sz reg (AddrBaseIndex (EABaseReg esp) 
+                                                        EAIndexNone
                                                          (ImmInt 0))]
                         )
             else return (size,
@@ -3136,6 +3197,9 @@ genCCall target dest_regs args vols = do
          (reg,code) <- getSomeReg op
         return (code, reg, cmmExprRep op)
  
+#endif /* i386_TARGET_ARCH */
+
+#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  outOfLineFloatOp :: CallishMachOp -> CmmReg -> [(CmmExpr,MachHint)]
    -> Maybe [GlobalReg] -> NatM InstrBlock
@@ -3159,6 +3223,10 @@ outOfLineFloatOp mop res args vols
         lbl = CmmLabel (mkForeignLabel fn Nothing False)
  
         fn = case mop of
+             MO_F32_Sqrt  -> FSLIT("sqrt")
+             MO_F32_Sin   -> FSLIT("sin")
+             MO_F32_Cos   -> FSLIT("cos")
+             MO_F32_Tan   -> FSLIT("tan")
               MO_F32_Exp   -> FSLIT("exp")
               MO_F32_Log   -> FSLIT("log")
  
@@ -3171,6 +3239,10 @@ outOfLineFloatOp mop res args vols
               MO_F32_Tanh  -> FSLIT("tanh")
               MO_F32_Pwr   -> FSLIT("pow")
  
+             MO_F64_Sqrt  -> FSLIT("sqrt")
+             MO_F64_Sin   -> FSLIT("sin")
+             MO_F64_Cos   -> FSLIT("cos")
+             MO_F64_Tan   -> FSLIT("tan")
               MO_F64_Exp   -> FSLIT("exp")
               MO_F64_Log   -> FSLIT("log")
  
@@ -3183,24 +3255,29 @@ outOfLineFloatOp mop res args vols
               MO_F64_Tanh  -> FSLIT("tanh")
               MO_F64_Pwr   -> FSLIT("pow")
  
-              other -> pprPanic "outOfLineFloatOp" (pprCallishMachOp mop)
-
-#endif /* i386_TARGET_ARCH */
+#endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
  #if x86_64_TARGET_ARCH
  
  genCCall (CmmPrim op) [(r,_)] args vols = 
-  panic "genCCall(CmmPrim)(x86_64)"
+  outOfLineFloatOp op r args vols
  
  genCCall target dest_regs args vols = do
  
         -- load up the register arguments
-    (stack_args, sse_regs, load_args_code)
-        <- load_args args allArgRegs allFPArgRegs 0 nilOL
+    (stack_args, aregs, fregs, load_args_code)
+        <- load_args args allArgRegs allFPArgRegs nilOL
  
      let
+       fp_regs_used  = reverse (drop (length fregs) (reverse allFPArgRegs))
+       int_regs_used = reverse (drop (length aregs) (reverse allArgRegs))
+       arg_regs = int_regs_used ++ fp_regs_used
+               -- for annotating the call instruction with
+
+       sse_regs = length fp_regs_used
+
         tot_arg_size = arg_size * length stack_args
  
         -- On entry to the called function, %rsp should be aligned
@@ -3232,11 +3309,11 @@ genCCall target dest_regs args vols = do
         -- CmmPrim -> ...
          CmmForeignCall (CmmLit (CmmLabel lbl)) conv
             -> -- ToDo: stdcall arg sizes
-             return (unitOL (CALL (Left fn_imm)), conv)
+             return (unitOL (CALL (Left fn_imm) arg_regs), conv)
            where fn_imm = ImmCLbl lbl
          CmmForeignCall expr conv
             -> do (dyn_r, dyn_c) <- getSomeReg expr
-                return (dyn_c `snocOL` CALL (Right dyn_r), conv)
+                return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
  
      let
         -- The x86_64 ABI requires us to set %al to the number of SSE
@@ -3287,31 +3364,31 @@ genCCall target dest_regs args vols = do
      load_args :: [(CmmExpr,MachHint)]
               -> [Reg]                  -- int regs avail for args
               -> [Reg]                  -- FP regs avail for args
-             -> Int -> InstrBlock
-             -> NatM ([(CmmExpr,MachHint)],Int,InstrBlock)
-    load_args args [] [] sse_regs code = return (args, sse_regs, code)
+             -> InstrBlock
+             -> NatM ([(CmmExpr,MachHint)],[Reg],[Reg],InstrBlock)
+    load_args args [] [] code     =  return (args, [], [], code)
         -- no more regs to use
-    load_args [] aregs fregs sse_regs code = return ([],sse_regs,code)
+    load_args [] aregs fregs code =  return ([], aregs, fregs, code)
         -- no more args to push
-    load_args ((arg,hint) : rest) aregs fregs sse_regs code
+    load_args ((arg,hint) : rest) aregs fregs code
         | isFloatingRep arg_rep = 
         case fregs of
           [] -> push_this_arg
           (r:rs) -> do
              arg_code <- getAnyReg arg
-            load_args rest aregs rs (sse_regs+1) (code `appOL` arg_code r)
+            load_args rest aregs rs (code `appOL` arg_code r)
         | otherwise =
         case aregs of
           [] -> push_this_arg
           (r:rs) -> do
              arg_code <- getAnyReg arg
-            load_args rest rs fregs sse_regs (code `appOL` arg_code r)
+            load_args rest rs fregs (code `appOL` arg_code r)
         where
           arg_rep = cmmExprRep arg
  
           push_this_arg = do
-           (args',sse',code') <- load_args rest aregs fregs sse_regs code
-           return ((arg,hint):args', sse', code')
+           (args',ars,frs,code') <- load_args rest aregs fregs code
+           return ((arg,hint):args', ars, frs, code')
  
      push_args [] code = return code
      push_args ((arg,hint):rest) code
@@ -3747,7 +3824,7 @@ genSwitch expr ids = do
    lbl <- getNewLabelNat
    let
         jumpTable = map jumpTableEntry ids
-       op = OpAddr (AddrBaseIndex Nothing (Just (reg,wORD_SIZE)) (ImmCLbl lbl))
+       op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl))
         code = e_code `appOL` toOL [
                 LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
                 JMP_TBL op [ id | Just id <- ids ]
@@ -3839,38 +3916,77 @@ condIntReg cond x y = do
    let 
         code dst = cond_code `appOL` toOL [
                     SETCC cond (OpReg tmp),
-                   MOV I32 (OpReg tmp) (OpReg dst),
-                   AND I32 (OpImm (ImmInt 1)) (OpReg dst)
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
                   ]
-                  -- NB. (1) Tha AND is needed here because the x86 only
-                  -- sets the low byte in the SETCC instruction.
-                  -- NB. (2) The extra temporary register is a hack to
-                  -- work around the fact that the setcc instructions only
-                  -- accept byte registers.  dst might not be a byte-able reg,
-                  -- but currently all free registers are byte-able, so we're
-                  -- guaranteed that a new temporary is byte-able.
    -- in
    return (Any I32 code)
  
+#endif
+
+#if i386_TARGET_ARCH
  
  condFltReg cond x y = do
-  lbl1 <- getBlockIdNat
-  lbl2 <- getBlockIdNat
    CondCode _ cond cond_code <- condFltCode cond x y
-  let
-       code dst = cond_code `appOL` toOL [
-           JXX cond lbl1,
-           MOV I32 (OpImm (ImmInt 0)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl1,
-           MOV I32 (OpImm (ImmInt 1)) (OpReg dst),
-           JXX ALWAYS lbl2,
-           NEWBLOCK lbl2]
-               -- SIGH, have to split up this block somehow...
+  tmp <- getNewRegNat I8
+  let 
+       code dst = cond_code `appOL` toOL [
+                   SETCC cond (OpReg tmp),
+                   MOVZxL I8 (OpReg tmp) (OpReg dst)
+                 ]
    -- in
    return (Any I32 code)
  
-#endif /* i386_TARGET_ARCH */
+#endif
+
+#if x86_64_TARGET_ARCH
+
+condFltReg cond x y = do
+  CondCode _ cond cond_code <- condFltCode cond x y
+  tmp1 <- getNewRegNat wordRep
+  tmp2 <- getNewRegNat wordRep
+  let 
+       -- We have to worry about unordered operands (eg. comparisons
+       -- against NaN).  If the operands are unordered, the comparison
+       -- sets the parity flag, carry flag and zero flag.
+       -- All comparisons are supposed to return false for unordered
+       -- operands except for !=, which returns true.
+       --
+       -- Optimisation: we don't have to test the parity flag if we
+       -- know the test has already excluded the unordered case: eg >
+       -- and >= test for a zero carry flag, which can only occur for
+       -- ordered operands.
+       --
+       -- ToDo: by reversing comparisons we could avoid testing the
+       -- parity flag in more cases.
+
+       code dst = 
+          cond_code `appOL` 
+            (case cond of
+               NE  -> or_unordered dst
+               GU  -> plain_test   dst
+               GEU -> plain_test   dst
+               _   -> and_ordered  dst)
+
+       plain_test dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   MOVZxL I8 (OpReg tmp1) (OpReg dst)
+                ]
+       or_unordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC PARITY (OpReg tmp2),
+                   OR I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+       and_ordered dst = toOL [
+                   SETCC cond (OpReg tmp1),
+                   SETCC NOTPARITY (OpReg tmp2),
+                   AND I8 (OpReg tmp1) (OpReg tmp2),
+                   MOVZxL I8 (OpReg tmp2) (OpReg dst)
+                 ]
+  -- in
+  return (Any I32 code)
+
+#endif
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  
@@ -4208,7 +4324,10 @@ trivialCode rep instr (Just revinstr) (CmmLit lit_a) b
    -- in
    return (Any rep code)
  
-trivialCode rep instr maybe_revinstr a b = do
+trivialCode rep instr maybe_revinstr a b = genTrivialCode rep instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode rep instr a b = do
    (b_op, b_code) <- getNonClobberedOperand b
    a_code <- getAnyReg a
    tmp <- getNewRegNat rep
@@ -4220,7 +4339,7 @@ trivialCode rep instr maybe_revinstr a b = do
       -- as the destination reg.  In this case, we have to save b in a
       -- new temporary across the computation of a.
       code dst
-       | dst `clashesWith` b_op =
+       | dst `regClashesWithOp` b_op =
                 b_code `appOL`
                 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
                 a_code dst `snocOL`
@@ -4231,10 +4350,10 @@ trivialCode rep instr maybe_revinstr a b = do
                 instr b_op (OpReg dst)
    -- in
    return (Any rep code)
- where
-  reg `clashesWith` OpReg reg2   = reg == reg2
-  reg `clashesWith` OpAddr amode = any (==reg) (addrModeRegs amode)
-  reg `clashesWith` _            = False
+
+reg `regClashesWithOp` OpReg reg2   = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+reg `regClashesWithOp` _            = False
  
  -----------
  
@@ -4266,19 +4385,7 @@ trivialFCode pk instr x y = do
  
  #if x86_64_TARGET_ARCH
  
--- We use the 2-operand SSE2 floating pt instructions.  ToDo: improve on
--- this by using some of the special cases in trivialCode above.
-trivialFCode pk instr x y = do
-  (y_reg, y_code) <- getNonClobberedReg y -- these work for float regs too
-  x_code <- getAnyReg x
-  let
-     code dst =
-       y_code `appOL`
-       x_code dst `snocOL`
-       instr pk (IF_ARCH_x86_64(OpReg,) y_reg)
-                (IF_ARCH_x86_64(OpReg,) dst)
-  -- in
-  return (Any pk code)
+trivialFCode pk instr x y = genTrivialCode  pk (instr pk) x y
  
  #endif
  
@@ -4545,7 +4652,7 @@ coerceFP2Int from to x = do
  coerceFP2Int from to x = do
    (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
    let
-        opc  = case to of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
          code dst = x_code `snocOL` opc x_op dst
    -- in
    return (Any to code) -- works even if the destination rep is <I32