Fix segfault in array copy primops on 32-bit

[ghc-hetmet.git] / compiler / coreSyn / CoreUnfold.lhs
diff --git a/compiler/coreSyn/CoreUnfold.lhs b/compiler/coreSyn/CoreUnfold.lhs

index f83521c..051e767 100644 (file)
--- a/compiler/coreSyn/CoreUnfold.lhs
+++ b/compiler/coreSyn/CoreUnfold.lhs
@@ -19,13 +19,14 @@ module CoreUnfold (
         Unfolding, UnfoldingGuidance,   -- Abstract types
  
         noUnfolding, mkImplicitUnfolding, 
         Unfolding, UnfoldingGuidance,   -- Abstract types
  
         noUnfolding, mkImplicitUnfolding, 
-       mkTopUnfolding, mkUnfolding, mkCoreUnfolding,
-       mkInlineRule, mkWwInlineRule,
+        mkUnfolding, mkCoreUnfolding,
+       mkTopUnfolding, mkSimpleUnfolding,
+       mkInlineUnfolding, mkInlinableUnfolding, mkWwInlineRule,
         mkCompulsoryUnfolding, mkDFunUnfolding,
  
         interestingArg, ArgSummary(..),
  
         mkCompulsoryUnfolding, mkDFunUnfolding,
  
         interestingArg, ArgSummary(..),
  
-       couldBeSmallEnoughToInline, 
+       couldBeSmallEnoughToInline, inlineBoringOk,
         certainlyWillInline, smallEnoughToInline,
  
         callSiteInline, CallCtxt(..), 
         certainlyWillInline, smallEnoughToInline,
  
         callSiteInline, CallCtxt(..), 
@@ -40,9 +41,11 @@ import StaticFlags
  import DynFlags
  import CoreSyn
  import PprCore         ()      -- Instances
  import DynFlags
  import CoreSyn
  import PprCore         ()      -- Instances
-import OccurAnal
+import TcType           ( tcSplitDFunTy )
+import OccurAnal        ( occurAnalyseExpr )
  import CoreSubst hiding( substTy )
  import CoreFVs         ( exprFreeVars )
  import CoreSubst hiding( substTy )
  import CoreFVs         ( exprFreeVars )
+import CoreArity       ( manifestArity, exprBotStrictness_maybe )
  import CoreUtils
  import Id
  import DataCon
  import CoreUtils
  import Id
  import DataCon
@@ -51,17 +54,19 @@ import Literal
  import PrimOp
  import IdInfo
  import BasicTypes      ( Arity )
  import PrimOp
  import IdInfo
  import BasicTypes      ( Arity )
-import TcType          ( tcSplitDFunTy )
-import Type 
+import Type
  import Coercion
  import PrelNames
  import VarEnv           ( mkInScopeSet )
  import Bag
  import Util
  import Coercion
  import PrelNames
  import VarEnv           ( mkInScopeSet )
  import Bag
  import Util
+import Pair
  import FastTypes
  import FastString
  import Outputable
  import FastTypes
  import FastString
  import Outputable
+import ForeignCall
  
  
+import Data.Maybe
  \end{code}
  
  
  \end{code}
  
  
@@ -72,12 +77,12 @@ import Outputable
  %************************************************************************
  
  \begin{code}
  %************************************************************************
  
  \begin{code}
-mkTopUnfolding :: CoreExpr -> Unfolding
-mkTopUnfolding expr = mkUnfolding True {- Top level -} expr
+mkTopUnfolding :: Bool -> CoreExpr -> Unfolding
+mkTopUnfolding = mkUnfolding InlineRhs True {- Top level -}
  
  mkImplicitUnfolding :: CoreExpr -> Unfolding
  -- For implicit Ids, do a tiny bit of optimising first
  
  mkImplicitUnfolding :: CoreExpr -> Unfolding
  -- For implicit Ids, do a tiny bit of optimising first
-mkImplicitUnfolding expr = mkTopUnfolding (simpleOptExpr expr)
+mkImplicitUnfolding expr = mkTopUnfolding False (simpleOptExpr expr) 
  
  -- Note [Top-level flag on inline rules]
  -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
  -- Note [Top-level flag on inline rules]
  -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -85,25 +90,60 @@ mkImplicitUnfolding expr = mkTopUnfolding (simpleOptExpr expr)
  -- top-level flag to True.  It gets set more accurately by the simplifier
  -- Simplify.simplUnfolding.
  
  -- top-level flag to True.  It gets set more accurately by the simplifier
  -- Simplify.simplUnfolding.
  
-mkUnfolding :: Bool -> CoreExpr -> Unfolding
-mkUnfolding top_lvl expr
-  = mkCoreUnfolding top_lvl expr arity guidance
+mkSimpleUnfolding :: CoreExpr -> Unfolding
+mkSimpleUnfolding = mkUnfolding InlineRhs False False
+
+mkDFunUnfolding :: Type -> [DFunArg CoreExpr] -> Unfolding
+mkDFunUnfolding dfun_ty ops 
+  = DFunUnfolding dfun_nargs data_con ops
    where
    where
-    (arity, guidance) = calcUnfoldingGuidance opt_UF_CreationThreshold expr
-       -- Sometimes during simplification, there's a large let-bound thing     
-       -- which has been substituted, and so is now dead; so 'expr' contains
-       -- two copies of the thing while the occurrence-analysed expression doesn't
-       -- Nevertheless, we *don't* occ-analyse before computing the size because the
-       -- size computation bales out after a while, whereas occurrence analysis does not.
-       --
-       -- This can occasionally mean that the guidance is very pessimistic;
-       -- it gets fixed up next round.  And it should be rare, because large
-       -- let-bound things that are dead are usually caught by preInlineUnconditionally
+    (tvs, n_theta, cls, _) = tcSplitDFunTy dfun_ty
+    dfun_nargs = length tvs + n_theta
+    data_con   = classDataCon cls
  
  
-mkCoreUnfolding :: Bool -> CoreExpr -> Arity -> UnfoldingGuidance -> Unfolding
+mkWwInlineRule :: Id -> CoreExpr -> Arity -> Unfolding
+mkWwInlineRule id expr arity
+  = mkCoreUnfolding (InlineWrapper id) True
+                   (simpleOptExpr expr) arity
+                   (UnfWhen unSaturatedOk boringCxtNotOk)
+
+mkCompulsoryUnfolding :: CoreExpr -> Unfolding
+mkCompulsoryUnfolding expr        -- Used for things that absolutely must be unfolded
+  = mkCoreUnfolding InlineCompulsory True
+                    (simpleOptExpr expr) 0    -- Arity of unfolding doesn't matter
+                    (UnfWhen unSaturatedOk boringCxtOk)
+
+mkInlineUnfolding :: Maybe Arity -> CoreExpr -> Unfolding
+mkInlineUnfolding mb_arity expr 
+  = mkCoreUnfolding InlineStable
+                   True         -- Note [Top-level flag on inline rules]
+                    expr' arity 
+                   (UnfWhen unsat_ok boring_ok)
+  where
+    expr' = simpleOptExpr expr
+    (unsat_ok, arity) = case mb_arity of
+                          Nothing -> (unSaturatedOk, manifestArity expr')
+                          Just ar -> (needSaturated, ar)
+              
+    boring_ok = inlineBoringOk expr'
+
+mkInlinableUnfolding :: CoreExpr -> Unfolding
+mkInlinableUnfolding expr
+  = mkUnfolding InlineStable True is_bot expr'
+  where
+    expr' = simpleOptExpr expr
+    is_bot = isJust (exprBotStrictness_maybe expr')
+\end{code}
+
+Internal functions
+
+\begin{code}
+mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
+                -> Arity -> UnfoldingGuidance -> Unfolding
  -- Occurrence-analyses the expression before capturing it
  -- Occurrence-analyses the expression before capturing it
-mkCoreUnfolding top_lvl expr arity guidance 
+mkCoreUnfolding src top_lvl expr arity guidance 
    = CoreUnfolding { uf_tmpl      = occurAnalyseExpr expr,
    = CoreUnfolding { uf_tmpl      = occurAnalyseExpr expr,
+                   uf_src        = src,
                     uf_arity      = arity,
                     uf_is_top     = top_lvl,
                     uf_is_value   = exprIsHNF        expr,
                     uf_arity      = arity,
                     uf_is_top     = top_lvl,
                     uf_is_value   = exprIsHNF        expr,
@@ -112,35 +152,38 @@ mkCoreUnfolding top_lvl expr arity guidance
                     uf_expandable = exprIsExpandable expr,
                     uf_guidance   = guidance }
  
                     uf_expandable = exprIsExpandable expr,
                     uf_guidance   = guidance }
  
-mkDFunUnfolding :: DataCon -> [Id] -> Unfolding
-mkDFunUnfolding con ops = DFunUnfolding con (map Var ops)
-
-mkWwInlineRule :: Id -> CoreExpr -> Arity -> Unfolding
-mkWwInlineRule id expr arity
-  = mkCoreUnfolding True (simpleOptExpr expr) arity
-         (InlineRule { ir_sat = InlUnSat, ir_info = InlWrapper id })
-
-mkCompulsoryUnfolding :: CoreExpr -> Unfolding
-mkCompulsoryUnfolding expr        -- Used for things that absolutely must be unfolded
-  = mkCoreUnfolding True expr 
-                    0    -- Arity of unfolding doesn't matter
-                    (InlineRule { ir_info = InlAlways, ir_sat = InlUnSat })    
-
-mkInlineRule :: InlSatFlag -> CoreExpr -> Arity -> Unfolding
-mkInlineRule sat expr arity 
-  = mkCoreUnfolding True        -- Note [Top-level flag on inline rules]
-                   expr' arity 
-                   (InlineRule { ir_sat = sat, ir_info = info })
+mkUnfolding :: UnfoldingSource -> Bool -> Bool -> CoreExpr -> Unfolding
+-- Calculates unfolding guidance
+-- Occurrence-analyses the expression before capturing it
+mkUnfolding src top_lvl is_bottoming expr
+  | top_lvl && is_bottoming
+  , not (exprIsTrivial expr)
+  = NoUnfolding    -- See Note [Do not inline top-level bottoming functions]
+  | otherwise
+  = CoreUnfolding { uf_tmpl      = occurAnalyseExpr expr,
+                   uf_src        = src,
+                   uf_arity      = arity,
+                   uf_is_top     = top_lvl,
+                   uf_is_value   = exprIsHNF        expr,
+                    uf_is_conlike = exprIsConLike    expr,
+                   uf_expandable = exprIsExpandable expr,
+                   uf_is_cheap   = is_cheap,
+                   uf_guidance   = guidance }
    where
    where
-    expr' = simpleOptExpr expr
-    info = if small then InlSmall else InlVanilla
-    small = case calcUnfoldingGuidance (arity+1) expr' of
-              (arity_e, UnfoldIfGoodArgs { ug_size = size_e }) 
-                   -> uncondInline arity_e size_e
-              _other {- actually UnfoldNever -} -> False
+    is_cheap = exprIsCheap expr
+    (arity, guidance) = calcUnfoldingGuidance is_cheap
+                                              opt_UF_CreationThreshold expr
+       -- Sometimes during simplification, there's a large let-bound thing     
+       -- which has been substituted, and so is now dead; so 'expr' contains
+       -- two copies of the thing while the occurrence-analysed expression doesn't
+       -- Nevertheless, we *don't* occ-analyse before computing the size because the
+       -- size computation bales out after a while, whereas occurrence analysis does not.
+       --
+       -- This can occasionally mean that the guidance is very pessimistic;
+       -- it gets fixed up next round.  And it should be rare, because large
+       -- let-bound things that are dead are usually caught by preInlineUnconditionally
  \end{code}
  
  \end{code}
  
-
  %************************************************************************
  %*                                                                     *
  \subsection{The UnfoldingGuidance type}
  %************************************************************************
  %*                                                                     *
  \subsection{The UnfoldingGuidance type}
@@ -148,26 +191,57 @@ mkInlineRule sat expr arity
  %************************************************************************
  
  \begin{code}
  %************************************************************************
  
  \begin{code}
+inlineBoringOk :: CoreExpr -> Bool
+-- See Note [INLINE for small functions]
+-- True => the result of inlining the expression is 
+--         no bigger than the expression itself
+--     eg      (\x y -> f y x)
+-- This is a quick and dirty version. It doesn't attempt
+-- to deal with  (\x y z -> x (y z))
+-- The really important one is (x `cast` c)
+inlineBoringOk e
+  = go 0 e
+  where
+    go :: Int -> CoreExpr -> Bool
+    go credit (Lam x e) | isId x           = go (credit+1) e
+                        | otherwise        = go credit e
+    go credit (App f (Type {}))            = go credit f
+    go credit (App f a) | credit > 0  
+                        , exprIsTrivial a  = go (credit-1) f
+    go credit (Note _ e)                  = go credit e     
+    go credit (Cast e _)                  = go credit e
+    go _      (Var {})                            = boringCxtOk
+    go _      _                                   = boringCxtNotOk
+
  calcUnfoldingGuidance
  calcUnfoldingGuidance
-       :: Int                  -- bomb out if size gets bigger than this
-       -> CoreExpr             -- expression to look at
+       :: Bool         -- True <=> the rhs is cheap, or we want to treat it
+                       --          as cheap (INLINE things)     
+        -> Int         -- Bomb out if size gets bigger than this
+       -> CoreExpr     -- Expression to look at
         -> (Arity, UnfoldingGuidance)
         -> (Arity, UnfoldingGuidance)
-calcUnfoldingGuidance bOMB_OUT_SIZE expr
-  = case collectBinders expr of { (binders, body) ->
+calcUnfoldingGuidance expr_is_cheap bOMB_OUT_SIZE expr
+  = case collectBinders expr of { (bndrs, body) ->
      let
      let
-        val_binders = filter isId binders
-       n_val_binders = length val_binders
+        val_bndrs   = filter isId bndrs
+       n_val_bndrs = length val_bndrs
+
+       guidance 
+          = case (sizeExpr (iUnbox bOMB_OUT_SIZE) val_bndrs body) of
+             TooBig -> UnfNever
+             SizeIs size cased_bndrs scrut_discount
+               | uncondInline n_val_bndrs (iBox size)
+                , expr_is_cheap
+               -> UnfWhen unSaturatedOk boringCxtOk   -- Note [INLINE for small functions]
+               | otherwise
+               -> UnfIfGoodArgs { ug_args  = map (discount cased_bndrs) val_bndrs
+                                , ug_size  = iBox size
+                                , ug_res   = iBox scrut_discount }
+
+        discount cbs bndr
+           = foldlBag (\acc (b',n) -> if bndr==b' then acc+n else acc) 
+                     0 cbs
      in
      in
-    case (sizeExpr (iUnbox bOMB_OUT_SIZE) val_binders body) of
-      TooBig -> (n_val_binders, UnfoldNever)
-      SizeIs size cased_args scrut_discount
-       -> (n_val_binders, UnfoldIfGoodArgs { ug_args  = map discount_for val_binders
-                                           , ug_size  = iBox size
-                                           , ug_res   = iBox scrut_discount })
-       where        
-           discount_for b = foldlBag (\acc (b',n) -> if b==b' then acc+n else acc) 
-                                     0 cased_args
-    }
+    (n_val_bndrs, guidance) }
  \end{code}
  
  Note [Computing the size of an expression]
  \end{code}
  
  Note [Computing the size of an expression]
@@ -201,27 +275,67 @@ Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
  a function call to account for.  Notice also that constructor applications 
  are very cheap, because exposing them to a caller is so valuable.
  
  a function call to account for.  Notice also that constructor applications 
  are very cheap, because exposing them to a caller is so valuable.
  
-Note [Unconditional inlining]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
-than the thing it's replacing.  Notice that
+[25/5/11] All sizes are now multiplied by 10, except for primops.
+This makes primops look cheap, and seems to be almost universally
+beneficial.  Done partly as a result of #4978.
+
+Note [Do not inline top-level bottoming functions]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The FloatOut pass has gone to some trouble to float out calls to 'error' 
+and similar friends.  See Note [Bottoming floats] in SetLevels.
+Do not re-inline them!  But we *do* still inline if they are very small
+(the uncondInline stuff).
+
+
+Note [INLINE for small functions]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider       {-# INLINE f #-}
+                f x = Just x
+                g y = f y
+Then f's RHS is no larger than its LHS, so we should inline it into
+even the most boring context.  In general, if the function is
+sufficiently small that its body is as small as the call itself, then
+inline unconditionally, regardless of how boring the context is.
+
+Things to note:
+
+ * We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
+   than the thing it's replacing.  Notice that
        (f x) --> (g 3)            -- YES, unconditionally
        (f x) --> x : []           -- YES, *even though* there are two
                                   --      arguments to the cons
        x     --> g 3              -- NO
        x            --> Just v            -- NO
  
        (f x) --> (g 3)            -- YES, unconditionally
        (f x) --> x : []           -- YES, *even though* there are two
                                   --      arguments to the cons
        x     --> g 3              -- NO
        x            --> Just v            -- NO
  
-It's very important not to unconditionally replace a variable by
-a non-atomic term.
+  It's very important not to unconditionally replace a variable by
+  a non-atomic term.
+
+* We do this even if the thing isn't saturated, else we end up with the
+  silly situation that
+     f x y = x
+     ...map (f 3)...
+  doesn't inline.  Even in a boring context, inlining without being
+  saturated will give a lambda instead of a PAP, and will be more
+  efficient at runtime.
+
+* However, when the function's arity > 0, we do insist that it 
+  has at least one value argument at the call site.  Otherwise we find this:
+       f = /\a \x:a. x
+       d = /\b. MkD (f b)
+  If we inline f here we get
+       d = /\b. MkD (\x:b. x)
+  and then prepareRhs floats out the argument, abstracting the type
+  variables, so we end up with the original again!
+
  
  \begin{code}
  uncondInline :: Arity -> Int -> Bool
  -- Inline unconditionally if there no size increase
  -- Size of call is arity (+1 for the function)
  
  \begin{code}
  uncondInline :: Arity -> Int -> Bool
  -- Inline unconditionally if there no size increase
  -- Size of call is arity (+1 for the function)
--- See Note [Unconditional inlining]
+-- See Note [INLINE for small functions]
  uncondInline arity size 
    | arity == 0 = size == 0
  uncondInline arity size 
    | arity == 0 = size == 0
-  | otherwise  = size <= arity + 1
+  | otherwise  = size <= 10 * (arity + 1)
  \end{code}
  
  
  \end{code}
  
  
@@ -240,34 +354,34 @@ sizeExpr bOMB_OUT_SIZE top_args expr
      size_up (Cast e _) = size_up e
      size_up (Note _ e) = size_up e
      size_up (Type _)   = sizeZero           -- Types cost nothing
      size_up (Cast e _) = size_up e
      size_up (Note _ e) = size_up e
      size_up (Type _)   = sizeZero           -- Types cost nothing
+    size_up (Coercion _) = sizeZero
      size_up (Lit lit)  = sizeN (litSize lit)
      size_up (Var f)    = size_up_call f []  -- Make sure we get constructor
                                             -- discounts even on nullary constructors
  
      size_up (App fun (Type _)) = size_up fun
      size_up (Lit lit)  = sizeN (litSize lit)
      size_up (Var f)    = size_up_call f []  -- Make sure we get constructor
                                             -- discounts even on nullary constructors
  
      size_up (App fun (Type _)) = size_up fun
-    size_up (App fun arg)      = size_up_app fun [arg]
-                                 `addSize` nukeScrutDiscount (size_up arg)
+    size_up (App fun (Coercion _)) = size_up fun
+    size_up (App fun arg)      = size_up arg  `addSizeNSD`
+                                 size_up_app fun [arg]
  
  
-    size_up (Lam b e) | isId b    = lamScrutDiscount (size_up e `addSizeN` 1)
+    size_up (Lam b e) | isId b    = lamScrutDiscount (size_up e `addSizeN` 10)
                       | otherwise = size_up e
  
      size_up (Let (NonRec binder rhs) body)
                       | otherwise = size_up e
  
      size_up (Let (NonRec binder rhs) body)
-      = nukeScrutDiscount (size_up rhs)                `addSize`
-       size_up body                            `addSizeN`
-       (if isUnLiftedType (idType binder) then 0 else 1)
+      = size_up rhs            `addSizeNSD`
+       size_up body            `addSizeN`
+        (if isUnLiftedType (idType binder) then 0 else 10)
                 -- For the allocation
                 -- If the binder has an unlifted type there is no allocation
  
      size_up (Let (Rec pairs) body)
                 -- For the allocation
                 -- If the binder has an unlifted type there is no allocation
  
      size_up (Let (Rec pairs) body)
-      = nukeScrutDiscount rhs_size             `addSize`
-       size_up body                            `addSizeN`
-       length pairs            -- For the allocation
-      where
-       rhs_size = foldr (addSize . size_up . snd) sizeZero pairs
+      = foldr (addSizeNSD . size_up . snd) 
+              (size_up body `addSizeN` (10 * length pairs))     -- (length pairs) for the allocation
+              pairs
  
      size_up (Case (Var v) _ _ alts) 
         | v `elem` top_args             -- We are scrutinising an argument variable
  
      size_up (Case (Var v) _ _ alts) 
         | v `elem` top_args             -- We are scrutinising an argument variable
-       = alts_size (foldr addSize sizeOne alt_sizes)   -- The 1 is for the case itself
+       = alts_size (foldr1 addAltSize alt_sizes)
                     (foldr1 maxSize alt_sizes)
                 -- Good to inline if an arg is scrutinised, because
                 -- that may eliminate allocation in the caller
                     (foldr1 maxSize alt_sizes)
                 -- Good to inline if an arg is scrutinised, because
                 -- that may eliminate allocation in the caller
@@ -277,9 +391,9 @@ sizeExpr bOMB_OUT_SIZE top_args expr
  
                 -- alts_size tries to compute a good discount for
                 -- the case when we are scrutinising an argument variable
  
                 -- alts_size tries to compute a good discount for
                 -- the case when we are scrutinising an argument variable
-         alts_size (SizeIs tot tot_disc _tot_scrut)           -- Size of all alternatives
-                   (SizeIs max _max_disc  max_scrut)           -- Size of biggest alternative
-               = SizeIs tot (unitBag (v, iBox (_ILIT(1) +# tot -# max)) `unionBags` tot_disc) max_scrut
+         alts_size (SizeIs tot tot_disc tot_scrut)  -- Size of all alternatives
+                   (SizeIs max _        _)          -- Size of biggest alternative
+                = SizeIs tot (unitBag (v, iBox (_ILIT(20) +# tot -# max)) `unionBags` tot_disc) tot_scrut
                         -- If the variable is known, we produce a discount that
                         -- will take us back to 'max', the size of the largest alternative
                         -- The 1+ is a little discount for reduced allocation in the caller
                         -- If the variable is known, we produce a discount that
                         -- will take us back to 'max', the size of the largest alternative
                         -- The 1+ is a little discount for reduced allocation in the caller
@@ -289,22 +403,48 @@ sizeExpr bOMB_OUT_SIZE top_args expr
  
           alts_size tot_size _ = tot_size
  
  
           alts_size tot_size _ = tot_size
  
-    size_up (Case e _ _ alts) = foldr (addSize . size_up_alt) 
-                                     (nukeScrutDiscount (size_up e))
-                                     alts
-                               `addSizeN` 1    -- Add 1 for the case itself
-               -- We don't charge for the case itself
-               -- It's a strict thing, and the price of the call
-               -- is paid by scrut.  Also consider
-               --      case f x of DEFAULT -> e
-               -- This is just ';'!  Don't charge for it.
+    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
+                                foldr (addAltSize . size_up_alt) case_size alts
+      where
+          case_size
+           | is_inline_scrut e, not (lengthExceeds alts 1)  = sizeN (-10)
+           | otherwise = sizeZero
+                -- Normally we don't charge for the case itself, but
+                -- we charge one per alternative (see size_up_alt,
+                -- below) to account for the cost of the info table
+                -- and comparisons.
+                --
+                -- However, in certain cases (see is_inline_scrut
+                -- below), no code is generated for the case unless
+                -- there are multiple alts.  In these cases we
+                -- subtract one, making the first alt free.
+                -- e.g. case x# +# y# of _ -> ...   should cost 1
+                --      case touch# x# of _ -> ...  should cost 0
+                -- (see #4978)
+                --
+                -- I would like to not have the "not (lengthExceeds alts 1)"
+                -- condition above, but without that some programs got worse
+                -- (spectral/hartel/event and spectral/para).  I don't fully
+                -- understand why. (SDM 24/5/11)
+
+                -- unboxed variables, inline primops and unsafe foreign calls
+                -- are all "inline" things:
+          is_inline_scrut (Var v) = isUnLiftedType (idType v)
+          is_inline_scrut scrut
+              | (Var f, _) <- collectArgs scrut
+                = case idDetails f of
+                    FCallId fc  -> not (isSafeForeignCall fc)
+                    PrimOpId op -> not (primOpOutOfLine op)
+                    _other      -> False
+              | otherwise
+                = False
  
      ------------ 
      -- size_up_app is used when there's ONE OR MORE value args
      size_up_app (App fun arg) args 
  
      ------------ 
      -- size_up_app is used when there's ONE OR MORE value args
      size_up_app (App fun arg) args 
-       | isTypeArg arg            = size_up_app fun args
-       | otherwise                = size_up_app fun (arg:args)
-                                    `addSize` nukeScrutDiscount (size_up arg)
+       | isTyCoArg arg            = size_up_app fun args
+       | otherwise                = size_up arg  `addSizeNSD`
+                                     size_up_app fun (arg:args)
      size_up_app (Var fun)     args = size_up_call fun args
      size_up_app other         args = size_up other `addSizeN` length args
  
      size_up_app (Var fun)     args = size_up_call fun args
      size_up_app other         args = size_up other `addSizeN` length args
  
@@ -312,16 +452,20 @@ sizeExpr bOMB_OUT_SIZE top_args expr
      size_up_call :: Id -> [CoreExpr] -> ExprSize
      size_up_call fun val_args
         = case idDetails fun of
      size_up_call :: Id -> [CoreExpr] -> ExprSize
      size_up_call fun val_args
         = case idDetails fun of
-           FCallId _        -> sizeN opt_UF_DearOp
+           FCallId _        -> sizeN (10 * (1 + length val_args))
             DataConWorkId dc -> conSize    dc (length val_args)
             PrimOpId op      -> primOpSize op (length val_args)
            ClassOpId _      -> classOpSize top_args val_args
            _                -> funSize top_args fun (length val_args)
  
      ------------ 
             DataConWorkId dc -> conSize    dc (length val_args)
             PrimOpId op      -> primOpSize op (length val_args)
            ClassOpId _      -> classOpSize top_args val_args
            _                -> funSize top_args fun (length val_args)
  
      ------------ 
-    size_up_alt (_con, _bndrs, rhs) = size_up rhs
+    size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
         -- Don't charge for args, so that wrappers look cheap
         -- (See comments about wrappers with Case)
         -- Don't charge for args, so that wrappers look cheap
         -- (See comments about wrappers with Case)
+       --
+       -- IMPORTANT: *do* charge 1 for the alternative, else we 
+       -- find that giant case nests are treated as practically free
+       -- A good example is Foreign.C.Error.errnoToIOError
  
      ------------
         -- These addSize things have to be here because
  
      ------------
         -- These addSize things have to be here because
@@ -329,17 +473,29 @@ sizeExpr bOMB_OUT_SIZE top_args expr
      addSizeN TooBig          _  = TooBig
      addSizeN (SizeIs n xs d) m         = mkSizeIs bOMB_OUT_SIZE (n +# iUnbox m) xs d
      
      addSizeN TooBig          _  = TooBig
      addSizeN (SizeIs n xs d) m         = mkSizeIs bOMB_OUT_SIZE (n +# iUnbox m) xs d
      
-    addSize TooBig           _                 = TooBig
-    addSize _                TooBig            = TooBig
-    addSize (SizeIs n1 xs d1) (SizeIs n2 ys d2) 
-       = mkSizeIs bOMB_OUT_SIZE (n1 +# n2) (xs `unionBags` ys) (d1 +# d2)
+        -- addAltSize is used to add the sizes of case alternatives
+    addAltSize TooBig           _      = TooBig
+    addAltSize _                TooBig = TooBig
+    addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2) 
+       = mkSizeIs bOMB_OUT_SIZE (n1 +# n2) 
+                                 (xs `unionBags` ys) 
+                                 (d1 +# d2)   -- Note [addAltSize result discounts]
+
+        -- This variant ignores the result discount from its LEFT argument
+       -- It's used when the second argument isn't part of the result
+    addSizeNSD TooBig           _      = TooBig
+    addSizeNSD _                TooBig = TooBig
+    addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2) 
+       = mkSizeIs bOMB_OUT_SIZE (n1 +# n2) 
+                                 (xs `unionBags` ys) 
+                                 d2  -- Ignore d1
  \end{code}
  
  \begin{code}
  -- | Finds a nominal size of a string literal.
  litSize :: Literal -> Int
  -- Used by CoreUnfold.sizeExpr
  \end{code}
  
  \begin{code}
  -- | Finds a nominal size of a string literal.
  litSize :: Literal -> Int
  -- Used by CoreUnfold.sizeExpr
-litSize (MachStr str) = 1 + ((lengthFS str + 3) `div` 4)
+litSize (MachStr str) = 10 + 10 * ((lengthFS str + 3) `div` 4)
         -- If size could be 0 then @f "x"@ might be too small
         -- [Sept03: make literal strings a bit bigger to avoid fruitless 
         --  duplication of little strings]
         -- If size could be 0 then @f "x"@ might be too small
         -- [Sept03: make literal strings a bit bigger to avoid fruitless 
         --  duplication of little strings]
@@ -354,7 +510,7 @@ classOpSize _ []
  classOpSize top_args (arg1 : other_args)
    = SizeIs (iUnbox size) arg_discount (_ILIT(0))
    where
  classOpSize top_args (arg1 : other_args)
    = SizeIs (iUnbox size) arg_discount (_ILIT(0))
    where
-    size = 2 + length other_args
+    size = 20 + (10 * length other_args)
      -- If the class op is scrutinising a lambda bound dictionary then
      -- give it a discount, to encourage the inlining of this function
      -- The actual discount is rather arbitrarily chosen
      -- If the class op is scrutinising a lambda bound dictionary then
      -- give it a discount, to encourage the inlining of this function
      -- The actual discount is rather arbitrarily chosen
@@ -382,8 +538,7 @@ funSize top_args fun n_val_args
      res_discount | idArity fun > n_val_args = opt_UF_FunAppDiscount
                  | otherwise                = 0
          -- If the function is partially applied, show a result discount
      res_discount | idArity fun > n_val_args = opt_UF_FunAppDiscount
                  | otherwise                = 0
          -- If the function is partially applied, show a result discount
-
-    size | some_val_args = 1 + n_val_args
+    size | some_val_args = 10 * (1 + n_val_args)
           | otherwise     = 0
         -- The 1+ is for the function itself
         -- Add 1 for each non-trivial arg;
           | otherwise     = 0
         -- The 1+ is for the function itself
         -- Add 1 for each non-trivial arg;
@@ -392,40 +547,56 @@ funSize top_args fun n_val_args
  
  conSize :: DataCon -> Int -> ExprSize
  conSize dc n_val_args
  
  conSize :: DataCon -> Int -> ExprSize
  conSize dc n_val_args
-  | n_val_args == 0      = SizeIs (_ILIT(0)) emptyBag (_ILIT(1))       -- Like variables
-  | isUnboxedTupleCon dc = SizeIs (_ILIT(0)) emptyBag (iUnbox n_val_args +# _ILIT(1))
-  | otherwise           = SizeIs (_ILIT(1)) emptyBag (iUnbox n_val_args +# _ILIT(1))
-       -- Treat a constructors application as size 1, regardless of how
-       -- many arguments it has; we are keen to expose them
-       -- (and we charge separately for their args).  We can't treat
-       -- them as size zero, else we find that (Just x) has size 0,
-       -- which is the same as a lone variable; and hence 'v' will 
-       -- always be replaced by (Just x), where v is bound to Just x.
-       --
-       -- However, unboxed tuples count as size zero
-       -- I found occasions where we had 
-       --      f x y z = case op# x y z of { s -> (# s, () #) }
-       -- and f wasn't getting inlined
+  | n_val_args == 0 = SizeIs (_ILIT(0)) emptyBag (_ILIT(10))    -- Like variables
+
+-- See Note [Unboxed tuple result discount]
+  | isUnboxedTupleCon dc = SizeIs (_ILIT(0)) emptyBag (iUnbox (10 * (1 + n_val_args)))
+
+-- See Note [Constructor size]
+  | otherwise = SizeIs (_ILIT(10)) emptyBag (iUnbox (10 * (10 + n_val_args)))
+     -- discount was (10 * (1 + n_val_args)), but it turns out that
+     -- adding a bigger constant here is an unambiguous win.  We
+     -- REALLY like unfolding constructors that get scrutinised.
+     -- [SDM, 25/5/11]
+\end{code}
  
  
+Note [Constructor size]
+~~~~~~~~~~~~~~~~~~~~~~~
+Treat a constructor application as size 1, regardless of how many
+arguments it has; we are keen to expose them (and we charge separately
+for their args).  We can't treat them as size zero, else we find that
+(Just x) has size 0, which is the same as a lone variable; and hence
+'v' will always be replaced by (Just x), where v is bound to Just x.
+
+However, unboxed tuples count as size zero. I found occasions where we had 
+       f x y z = case op# x y z of { s -> (# s, () #) }
+and f wasn't getting inlined.
+
+Note [Unboxed tuple result discount]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+I tried giving unboxed tuples a *result discount* of zero (see the
+commented-out line).  Why?  When returned as a result they do not
+allocate, so maybe we don't want to charge so much for them.  If you
+have a non-zero discount here, we find that workers often get inlined
+back into wrappers, because it looks like
+    f x = case $wf x of (# a,b #) -> (a,b)
+and we are keener because of the case.  However while this change
+shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
+more. All other changes were very small. So it's not a big deal but I
+didn't adopt the idea.
+
+\begin{code}
  primOpSize :: PrimOp -> Int -> ExprSize
  primOpSize op n_val_args
  primOpSize :: PrimOp -> Int -> ExprSize
  primOpSize op n_val_args
- | not (primOpIsDupable op) = sizeN opt_UF_DearOp
- | not (primOpOutOfLine op) = sizeN 1
-       -- Be very keen to inline simple primops.
-       -- We give a discount of 1 for each arg so that (op# x y z) costs 2.
-       -- We can't make it cost 1, else we'll inline let v = (op# x y z) 
-       -- at every use of v, which is excessive.
-       --
-       -- A good example is:
-       --      let x = +# p q in C {x}
-       -- Even though x get's an occurrence of 'many', its RHS looks cheap,
-       -- and there's a good chance it'll get inlined back into C's RHS. Urgh!
-
- | otherwise = sizeN n_val_args
+ = if primOpOutOfLine op
+      then sizeN (op_size + n_val_args)
+      else sizeN op_size
+ where
+   op_size = primOpCodeSize op
  
  
  buildSize :: ExprSize
  
  
  buildSize :: ExprSize
-buildSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(4))
+buildSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(40))
         -- We really want to inline applications of build
         -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
         -- Indeed, we should add a result_discount because build is 
         -- We really want to inline applications of build
         -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
         -- Indeed, we should add a result_discount because build is 
@@ -434,20 +605,25 @@ buildSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(4))
         -- The "40" is rather arbitrary.
  
  augmentSize :: ExprSize
         -- The "40" is rather arbitrary.
  
  augmentSize :: ExprSize
-augmentSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(4))
+augmentSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(40))
         -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
         -- e plus ys. The -2 accounts for the \cn 
  
         -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
         -- e plus ys. The -2 accounts for the \cn 
  
-nukeScrutDiscount :: ExprSize -> ExprSize
-nukeScrutDiscount (SizeIs n vs _) = SizeIs n vs (_ILIT(0))
-nukeScrutDiscount TooBig          = TooBig
-
  -- When we return a lambda, give a discount if it's used (applied)
  lamScrutDiscount :: ExprSize -> ExprSize
  lamScrutDiscount (SizeIs n vs _) = SizeIs n vs (iUnbox opt_UF_FunAppDiscount)
  lamScrutDiscount TooBig          = TooBig
  \end{code}
  
  -- When we return a lambda, give a discount if it's used (applied)
  lamScrutDiscount :: ExprSize -> ExprSize
  lamScrutDiscount (SizeIs n vs _) = SizeIs n vs (iUnbox opt_UF_FunAppDiscount)
  lamScrutDiscount TooBig          = TooBig
  \end{code}
  
+Note [addAltSize result discounts]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When adding the size of alternatives, we *add* the result discounts
+too, rather than take the *maximum*.  For a multi-branch case, this
+gives a discount for each branch that returns a constructor, making us
+keener to inline.  I did try using 'max' instead, but it makes nofib 
+'rewrite' and 'puzzle' allocate significantly more, and didn't make
+binary sizes shrink significantly either.
+
  Note [Discounts and thresholds]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Constants for discounts and thresholds are defined in main/StaticFlags,
  Note [Discounts and thresholds]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Constants for discounts and thresholds are defined in main/StaticFlags,
@@ -518,17 +694,14 @@ maxSize _              TooBig                               = TooBig
  maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 ># n2  = s1
                                               | otherwise = s2
  
  maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 ># n2  = s1
                                               | otherwise = s2
  
-sizeZero, sizeOne :: ExprSize
+sizeZero :: ExprSize
  sizeN :: Int -> ExprSize
  
  sizeZero = SizeIs (_ILIT(0))  emptyBag (_ILIT(0))
  sizeN :: Int -> ExprSize
  
  sizeZero = SizeIs (_ILIT(0))  emptyBag (_ILIT(0))
-sizeOne  = SizeIs (_ILIT(1))  emptyBag (_ILIT(0))
  sizeN n  = SizeIs (iUnbox n) emptyBag (_ILIT(0))
  \end{code}
  
  
  sizeN n  = SizeIs (iUnbox n) emptyBag (_ILIT(0))
  \end{code}
  
  
-
-
  %************************************************************************
  %*                                                                     *
  \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
  %************************************************************************
  %*                                                                     *
  \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
@@ -543,13 +716,15 @@ actual arguments.
  \begin{code}
  couldBeSmallEnoughToInline :: Int -> CoreExpr -> Bool
  couldBeSmallEnoughToInline threshold rhs 
  \begin{code}
  couldBeSmallEnoughToInline :: Int -> CoreExpr -> Bool
  couldBeSmallEnoughToInline threshold rhs 
-  = case calcUnfoldingGuidance threshold rhs of
-       (_, UnfoldNever) -> False
-       _                -> True
+  = case sizeExpr (iUnbox threshold) [] body of
+       TooBig -> False
+       _      -> True
+  where
+    (_, body) = collectBinders rhs
  
  ----------------
  smallEnoughToInline :: Unfolding -> Bool
  
  ----------------
  smallEnoughToInline :: Unfolding -> Bool
-smallEnoughToInline (CoreUnfolding {uf_guidance = UnfoldIfGoodArgs {ug_size = size}})
+smallEnoughToInline (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
    = size <= opt_UF_UseThreshold
  smallEnoughToInline _
    = False
    = size <= opt_UF_UseThreshold
  smallEnoughToInline _
    = False
@@ -559,10 +734,10 @@ certainlyWillInline :: Unfolding -> Bool
    -- Sees if the unfolding is pretty certain to inline 
  certainlyWillInline (CoreUnfolding { uf_is_cheap = is_cheap, uf_arity = n_vals, uf_guidance = guidance })
    = case guidance of
    -- Sees if the unfolding is pretty certain to inline 
  certainlyWillInline (CoreUnfolding { uf_is_cheap = is_cheap, uf_arity = n_vals, uf_guidance = guidance })
    = case guidance of
-      UnfoldNever     -> False
-      InlineRule {}   -> True
-      UnfoldIfGoodArgs { ug_size = size} 
-                    -> is_cheap && size - (n_vals +1) <= opt_UF_UseThreshold
+      UnfNever      -> False
+      UnfWhen {}    -> True
+      UnfIfGoodArgs { ug_size = size} 
+                    -> is_cheap && size - (10 * (n_vals +1)) <= opt_UF_UseThreshold
  
  certainlyWillInline _
    = False
  
  certainlyWillInline _
    = False
@@ -592,14 +767,13 @@ StrictAnal.addStrictnessInfoToTopId
  
  \begin{code}
  callSiteInline :: DynFlags
  
  \begin{code}
  callSiteInline :: DynFlags
-              -> Bool                  -- True <=> the Id can be inlined
                -> Id                    -- The Id
                -> Id                    -- The Id
+              -> Bool                  -- True <=> unfolding is active
                -> Bool                  -- True if there are are no arguments at all (incl type args)
                -> [ArgSummary]          -- One for each value arg; True if it is interesting
                -> CallCtxt              -- True <=> continuation is interesting
                -> Maybe CoreExpr        -- Unfolding, if any
  
                -> Bool                  -- True if there are are no arguments at all (incl type args)
                -> [ArgSummary]          -- One for each value arg; True if it is interesting
                -> CallCtxt              -- True <=> continuation is interesting
                -> Maybe CoreExpr        -- Unfolding, if any
  
-
  instance Outputable ArgSummary where
    ppr TrivArg    = ptext (sLit "TrivArg")
    ppr NonTrivArg = ptext (sLit "NonTrivArg")
  instance Outputable ArgSummary where
    ppr TrivArg    = ptext (sLit "TrivArg")
    ppr NonTrivArg = ptext (sLit "NonTrivArg")
@@ -628,88 +802,92 @@ instance Outputable CallCtxt where
    ppr CaseCtxt               = ptext (sLit "CaseCtxt")
    ppr ValAppCtxt      = ptext (sLit "ValAppCtxt")
  
    ppr CaseCtxt               = ptext (sLit "CaseCtxt")
    ppr ValAppCtxt      = ptext (sLit "ValAppCtxt")
  
-callSiteInline dflags active_inline id lone_variable arg_infos cont_info
-  = let
-       n_val_args  = length arg_infos
-    in
-    case idUnfolding id of {
-       NoUnfolding      -> Nothing ;
-       OtherCon _       -> Nothing ;
-       DFunUnfolding {} -> Nothing ;   -- Never unfold a DFun
-       CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top, uf_is_value = is_value,
-                       uf_is_cheap = is_cheap, uf_arity = uf_arity, uf_guidance = guidance } ->
+callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
+  = case idUnfolding id of 
+      -- idUnfolding checks for loop-breakers, returning NoUnfolding
+      -- Things with an INLINE pragma may have an unfolding *and* 
+      -- be a loop breaker  (maybe the knot is not yet untied)
+       CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top 
+                     , uf_is_cheap = is_cheap, uf_arity = uf_arity
+                      , uf_guidance = guidance, uf_expandable = is_exp }
+          | active_unfolding -> tryUnfolding dflags id lone_variable 
+                                    arg_infos cont_info unf_template is_top 
+                                    is_cheap is_exp uf_arity guidance
+          | otherwise    -> Nothing
+       NoUnfolding      -> Nothing 
+       OtherCon {}      -> Nothing 
+       DFunUnfolding {} -> Nothing     -- Never unfold a DFun
+
+tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
+             -> CoreExpr -> Bool -> Bool -> Bool -> Arity -> UnfoldingGuidance
+            -> Maybe CoreExpr  
+tryUnfolding dflags id lone_variable 
+             arg_infos cont_info unf_template is_top 
+             is_cheap is_exp uf_arity guidance
                         -- uf_arity will typically be equal to (idArity id), 
                         -- but may be less for InlineRules
                         -- uf_arity will typically be equal to (idArity id), 
                         -- but may be less for InlineRules
-    let
-       result | yes_or_no = Just unf_template
-              | otherwise = Nothing
-
-       interesting_args = any nonTriv arg_infos 
-               -- NB: (any nonTriv arg_infos) looks at the
-               -- over-saturated args too which is "wrong"; 
-               -- but if over-saturated we inline anyway.
-
-              -- some_benefit is used when the RHS is small enough
-              -- and the call has enough (or too many) value
-              -- arguments (ie n_val_args >= arity). But there must
-              -- be *something* interesting about some argument, or the
-              -- result context, to make it worth inlining
-       some_benefit =  interesting_args
-                     || n_val_args > uf_arity      -- Over-saturated
-                     || interesting_saturated_call  -- Exactly saturated
-
-       interesting_saturated_call 
-         = case cont_info of
-             BoringCtxt -> not is_top && uf_arity > 0          -- Note [Nested functions]
-             CaseCtxt   -> not (lone_variable && is_value)     -- Note [Lone variables]
-             ArgCtxt {} -> uf_arity > 0                        -- Note [Inlining in ArgCtxt]
-             ValAppCtxt -> True                                -- Note [Cast then apply]
-
-       yes_or_no
-         = case guidance of
-             UnfoldNever  -> False
-
-             InlineRule { ir_info = inl_info, ir_sat = sat }
-                 | InlAlways <- inl_info -> True         -- No top-level binding, so inline!
-                                                        -- Ignore is_active because we want to 
-                                                         -- inline even if SimplGently is on.
-                | not active_inline     -> False
-                | n_val_args < uf_arity -> yes_unsat    -- Not enough value args
-                | InlSmall <- inl_info  -> True         -- Note [INLINE for small functions]
-                | otherwise             -> some_benefit -- Saturated or over-saturated
-                where
-                  -- See Note [Inlining an InlineRule]
-                  yes_unsat = case sat of 
-                                 InlSat   -> False
-                                InlUnSat -> interesting_args
-
-             UnfoldIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
-                | not active_inline          -> False
-                | not is_cheap               -> False
-                | n_val_args < uf_arity      -> interesting_args && small_enough       
-                                                       -- Note [Unsaturated applications]
-                | uncondInline uf_arity size -> True
-                | otherwise                  -> some_benefit && small_enough
-
-                where
-                  small_enough = (size - discount) <= opt_UF_UseThreshold
-                  discount = computeDiscount uf_arity arg_discounts 
-                                             res_discount arg_infos cont_info
-               
-    in    
-    if dopt Opt_D_dump_inlinings dflags then
-       pprTrace ("Considering inlining: " ++ showSDoc (ppr id))
-                (vcat [text "active:" <+> ppr active_inline,
-                       text "arg infos" <+> ppr arg_infos,
+ | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
+ = pprTrace ("Considering inlining: " ++ showSDoc (ppr id))
+                (vcat [text "arg infos" <+> ppr arg_infos,
+                       text "uf arity" <+> ppr uf_arity,
                         text "interesting continuation" <+> ppr cont_info,
                         text "interesting continuation" <+> ppr cont_info,
-                       text "is value:" <+> ppr is_value,
+                       text "some_benefit" <+> ppr some_benefit,
+                        text "is exp:" <+> ppr is_exp,
                          text "is cheap:" <+> ppr is_cheap,
                         text "guidance" <+> ppr guidance,
                          text "is cheap:" <+> ppr is_cheap,
                         text "guidance" <+> ppr guidance,
+                       extra_doc,
                         text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"])
                         text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"])
-                 result
-    else
-    result
-    }
+                result
+  | otherwise  = result
+
+  where
+    n_val_args = length arg_infos
+    saturated  = n_val_args >= uf_arity
+
+    result | yes_or_no = Just unf_template
+           | otherwise = Nothing
+
+    interesting_args = any nonTriv arg_infos 
+       -- NB: (any nonTriv arg_infos) looks at the
+       -- over-saturated args too which is "wrong"; 
+       -- but if over-saturated we inline anyway.
+
+           -- some_benefit is used when the RHS is small enough
+           -- and the call has enough (or too many) value
+           -- arguments (ie n_val_args >= arity). But there must
+           -- be *something* interesting about some argument, or the
+           -- result context, to make it worth inlining
+    some_benefit 
+       | not saturated = interesting_args      -- Under-saturated
+                                       -- Note [Unsaturated applications]
+       | n_val_args > uf_arity = True  -- Over-saturated
+       | otherwise = interesting_args  -- Saturated
+                  || interesting_saturated_call 
+
+    interesting_saturated_call 
+      = case cont_info of
+          BoringCtxt -> not is_top && uf_arity > 0       -- Note [Nested functions]
+          CaseCtxt   -> not (lone_variable && is_cheap)   -- Note [Lone variables]
+          ArgCtxt {} -> uf_arity > 0                     -- Note [Inlining in ArgCtxt]
+          ValAppCtxt -> True                             -- Note [Cast then apply]
+
+    (yes_or_no, extra_doc)
+      = case guidance of
+          UnfNever -> (False, empty)
+
+          UnfWhen unsat_ok boring_ok 
+             -> (enough_args && (boring_ok || some_benefit), empty )
+             where      -- See Note [INLINE for small functions]
+               enough_args = saturated || (unsat_ok && n_val_args > 0)
+
+          UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
+            -> ( is_cheap && some_benefit && small_enough
+                , (text "discounted size =" <+> int discounted_size) )
+            where
+              discounted_size = size - discount
+              small_enough = discounted_size <= opt_UF_UseThreshold
+              discount = computeDiscount uf_arity arg_discounts 
+                                         res_discount arg_infos cont_info
  \end{code}
  
  Note [RHS of lets]
  \end{code}
  
  Note [RHS of lets]
@@ -743,22 +921,12 @@ But the defn of GHC.Classes.$dmmin is:
      {- Arity: 3, HasNoCafRefs, Strictness: SLL,
         Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
                     case @ a GHC.Classes.<= @ a $dOrd x y of wild {
      {- Arity: 3, HasNoCafRefs, Strictness: SLL,
         Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
                     case @ a GHC.Classes.<= @ a $dOrd x y of wild {
-                     GHC.Bool.False -> y GHC.Bool.True -> x }) -}
+                     GHC.Types.False -> y GHC.Types.True -> x }) -}
  
  We *really* want to inline $dmmin, even though it has arity 3, in
  order to unravel the recursion.
  
  
  
  We *really* want to inline $dmmin, even though it has arity 3, in
  order to unravel the recursion.
  
  
-Note [INLINE for small functions]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Consider       {-# INLINE f #-}
-                f x = Just x
-                g y = f y
-Then f's RHS is no larger than its LHS, so we should inline it
-into even the most boring context.  (We do so if there is no INLINE
-pragma!)  That's the reason for the 'ug_small' flag on an InlineRule.
-
-
  Note [Things to watch]
  ~~~~~~~~~~~~~~~~~~~~~~
  *   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
  Note [Things to watch]
  ~~~~~~~~~~~~~~~~~~~~~~
  *   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
@@ -773,7 +941,7 @@ Note [Things to watch]
  Note [Inlining an InlineRule]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  An InlineRules is used for
  Note [Inlining an InlineRule]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  An InlineRules is used for
-  (a) pogrammer INLINE pragmas
+  (a) programmer INLINE pragmas
    (b) inlinings from worker/wrapper
  
  For (a) the RHS may be large, and our contract is that we *only* inline
    (b) inlinings from worker/wrapper
  
  For (a) the RHS may be large, and our contract is that we *only* inline
@@ -825,8 +993,8 @@ call is at least CONLIKE.  At least for the cases where we use ArgCtxt
  for the RHS of a 'let', we only profit from the inlining if we get a 
  CONLIKE thing (modulo lets).
  
  for the RHS of a 'let', we only profit from the inlining if we get a 
  CONLIKE thing (modulo lets).
  
-Note [Lone variables]
-~~~~~~~~~~~~~~~~~~~~~
+Note [Lone variables]  See also Note [Interaction of exprIsCheap and lone variables]
+~~~~~~~~~~~~~~~~~~~~~   which appears below
  The "lone-variable" case is important.  I spent ages messing about
  with unsatisfactory variants, but this is nice.  The idea is that if a
  variable appears all alone
  The "lone-variable" case is important.  I spent ages messing about
  with unsatisfactory variants, but this is nice.  The idea is that if a
  variable appears all alone
@@ -835,7 +1003,7 @@ variable appears all alone
         as scrutinee of a case          CaseCtxt
         as arg of a fn                  ArgCtxt
  AND
         as scrutinee of a case          CaseCtxt
         as arg of a fn                  ArgCtxt
  AND
-       it is bound to a value
+       it is bound to a cheap expression
  
  then we should not inline it (unless there is some other reason,
  e.g. it is the sole occurrence).  That is what is happening at 
  
  then we should not inline it (unless there is some other reason,
  e.g. it is the sole occurrence).  That is what is happening at 
@@ -887,6 +1055,27 @@ However, watch out:
     There's no advantage in inlining f here, and perhaps
     a significant disadvantage.  Hence some_val_args in the Stop case
  
     There's no advantage in inlining f here, and perhaps
     a significant disadvantage.  Hence some_val_args in the Stop case
  
+Note [Interaction of exprIsCheap and lone variables]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The lone-variable test says "don't inline if a case expression
+scrutinises a lone variable whose unfolding is cheap".  It's very 
+important that, under these circumstances, exprIsConApp_maybe
+can spot a constructor application. So, for example, we don't
+consider
+       let x = e in (x,x)
+to be cheap, and that's good because exprIsConApp_maybe doesn't
+think that expression is a constructor application.
+
+I used to test is_value rather than is_cheap, which was utterly
+wrong, because the above expression responds True to exprIsHNF.
+
+This kind of thing can occur if you have
+
+       {-# INLINE foo #-}
+       foo = let x = e in (x,x)
+
+which Roman did.
+
  \begin{code}
  computeDiscount :: Int -> [Int] -> Int -> [ArgSummary] -> CallCtxt -> Int
  computeDiscount n_vals_wanted arg_discounts res_discount arg_infos cont_info
  \begin{code}
  computeDiscount :: Int -> [Int] -> Int -> [ArgSummary] -> CallCtxt -> Int
  computeDiscount n_vals_wanted arg_discounts res_discount arg_infos cont_info
@@ -896,10 +1085,10 @@ computeDiscount n_vals_wanted arg_discounts res_discount arg_infos cont_info
         --  *efficiency* to be gained (e.g. beta reductions, case reductions) 
         -- by inlining.
  
         --  *efficiency* to be gained (e.g. beta reductions, case reductions) 
         -- by inlining.
  
-  = 1          -- Discount of 1 because the result replaces the call
+  = 10          -- Discount of 1 because the result replaces the call
                 -- so we count 1 for the function itself
  
                 -- so we count 1 for the function itself
  
-    + length (take n_vals_wanted arg_infos)
+    + 10 * length (take n_vals_wanted arg_infos)
                -- Discount of (un-scaled) 1 for each arg supplied, 
                -- because the result replaces the call
  
                -- Discount of (un-scaled) 1 for each arg supplied, 
                -- because the result replaces the call
  
@@ -909,13 +1098,13 @@ computeDiscount n_vals_wanted arg_discounts res_discount arg_infos cont_info
      arg_discount = sum (zipWith mk_arg_discount arg_discounts arg_infos)
  
      mk_arg_discount _       TrivArg    = 0 
      arg_discount = sum (zipWith mk_arg_discount arg_discounts arg_infos)
  
      mk_arg_discount _       TrivArg    = 0 
-    mk_arg_discount _       NonTrivArg = 1   
+    mk_arg_discount _        NonTrivArg = 10
      mk_arg_discount discount ValueArg   = discount 
  
      res_discount' = case cont_info of
                         BoringCtxt  -> 0
                         CaseCtxt    -> res_discount
      mk_arg_discount discount ValueArg   = discount 
  
      res_discount' = case cont_info of
                         BoringCtxt  -> 0
                         CaseCtxt    -> res_discount
-                       _other      -> 4 `min` res_discount
+                        _other      -> 40 `min` res_discount
                 -- res_discount can be very large when a function returns
                 -- constructors; but we only want to invoke that large discount
                 -- when there's a case continuation.
                 -- res_discount can be very large when a function returns
                 -- constructors; but we only want to invoke that large discount
                 -- when there's a case continuation.
@@ -984,7 +1173,9 @@ interestingArg e = go e 0
           conlike_unfolding = isConLikeUnfolding (idUnfolding v)
  
      go (Type _)          _ = TrivArg
           conlike_unfolding = isConLikeUnfolding (idUnfolding v)
  
      go (Type _)          _ = TrivArg
-    go (App fn (Type _)) n = go fn n    
+    go (Coercion _)      _ = TrivArg
+    go (App fn (Type _)) n = go fn n
+    go (App fn (Coercion _)) n = go fn n
      go (App fn _)        n = go fn (n+1)
      go (Note _ a)       n = go a n
      go (Cast e _)       n = go e n
      go (App fn _)        n = go fn (n+1)
      go (Note _ a)       n = go a n
      go (Cast e _)       n = go e n
@@ -1022,17 +1213,18 @@ However e might not *look* as if
  -- | Returns @Just (dc, [t1..tk], [x1..xn])@ if the argument expression is 
  -- a *saturated* constructor application of the form @dc t1..tk x1 .. xn@,
  -- where t1..tk are the *universally-qantified* type args of 'dc'
  -- | Returns @Just (dc, [t1..tk], [x1..xn])@ if the argument expression is 
  -- a *saturated* constructor application of the form @dc t1..tk x1 .. xn@,
  -- where t1..tk are the *universally-qantified* type args of 'dc'
-exprIsConApp_maybe :: CoreExpr -> Maybe (DataCon, [Type], [CoreExpr])
+exprIsConApp_maybe :: IdUnfoldingFun -> CoreExpr -> Maybe (DataCon, [Type], [CoreExpr])
  
  
-exprIsConApp_maybe (Note _ expr)
-  = exprIsConApp_maybe expr
-       -- We ignore all notes.  For example,
+exprIsConApp_maybe id_unf (Note note expr)
+  | notSccNote note
+  = exprIsConApp_maybe id_unf expr
+       -- We ignore all notes except SCCs.  For example,
         --      case _scc_ "foo" (C a b) of
         --                      C a b -> e
         --      case _scc_ "foo" (C a b) of
         --                      C a b -> e
-       -- should be optimised away, but it will be only if we look
-       -- through the SCC note.
+       -- should not be optimised away, because we'll lose the
+       -- entry count on 'foo'; see Trac #4414
  
  
-exprIsConApp_maybe (Cast expr co)
+exprIsConApp_maybe id_unf (Cast expr co)
    =     -- Here we do the KPush reduction rule as described in the FC paper
         -- The transformation applies iff we have
         --      (C e1 ... en) `cast` co
    =     -- Here we do the KPush reduction rule as described in the FC paper
         -- The transformation applies iff we have
         --      (C e1 ... en) `cast` co
@@ -1040,11 +1232,11 @@ exprIsConApp_maybe (Cast expr co)
         -- The left-hand one must be a T, because exprIsConApp returned True
         -- but the right-hand one might not be.  (Though it usually will.)
  
         -- The left-hand one must be a T, because exprIsConApp returned True
         -- but the right-hand one might not be.  (Though it usually will.)
  
-    case exprIsConApp_maybe expr of {
+    case exprIsConApp_maybe id_unf expr of {
         Nothing                          -> Nothing ;
         Just (dc, _dc_univ_args, dc_args) -> 
  
         Nothing                          -> Nothing ;
         Just (dc, _dc_univ_args, dc_args) -> 
  
-    let (_from_ty, to_ty) = coercionKind co
+    let Pair _from_ty to_ty = coercionKind co
         dc_tc = dataConTyCon dc
      in
      case splitTyConApp_maybe to_ty of {
         dc_tc = dataConTyCon dc
      in
      case splitTyConApp_maybe to_ty of {
@@ -1064,44 +1256,31 @@ exprIsConApp_maybe (Cast expr co)
          dc_ex_tyvars   = dataConExTyVars dc
          arg_tys        = dataConRepArgTys dc
  
          dc_ex_tyvars   = dataConExTyVars dc
          arg_tys        = dataConRepArgTys dc
  
-        dc_eqs :: [(Type,Type)]          -- All equalities from the DataCon
-        dc_eqs = [(mkTyVarTy tv, ty)   | (tv,ty) <- dataConEqSpec dc] ++
-                 [getEqPredTys eq_pred | eq_pred <- dataConEqTheta dc]
-
-        (ex_args, rest1)    = splitAtList dc_ex_tyvars dc_args
-       (co_args, val_args) = splitAtList dc_eqs rest1
+        (ex_args, val_args) = splitAtList dc_ex_tyvars dc_args
  
         -- Make the "theta" from Fig 3 of the paper
          gammas = decomposeCo tc_arity co
  
         -- Make the "theta" from Fig 3 of the paper
          gammas = decomposeCo tc_arity co
-        theta  = zipOpenTvSubst (dc_univ_tyvars ++ dc_ex_tyvars)
-                                (gammas         ++ stripTypeArgs ex_args)
-
-          -- Cast the existential coercion arguments
-        cast_co (ty1, ty2) (Type co) 
-          = Type $ mkSymCoercion (substTy theta ty1)
-                  `mkTransCoercion` co
-                  `mkTransCoercion` (substTy theta ty2)
-        cast_co _ other_arg = pprPanic "cast_co" (ppr other_arg)
-        new_co_args = zipWith cast_co dc_eqs co_args
-  
+        theta  = zipOpenCvSubst (dc_univ_tyvars ++ dc_ex_tyvars)
+                                (gammas         ++ map mkReflCo (stripTypeArgs ex_args))
+
            -- Cast the value arguments (which include dictionaries)
         new_val_args = zipWith cast_arg arg_tys val_args
            -- Cast the value arguments (which include dictionaries)
         new_val_args = zipWith cast_arg arg_tys val_args
-       cast_arg arg_ty arg = mkCoerce (substTy theta arg_ty) arg
+       cast_arg arg_ty arg = mkCoerce (liftCoSubst theta arg_ty) arg
      in
  #ifdef DEBUG
      let dump_doc = vcat [ppr dc,      ppr dc_univ_tyvars, ppr dc_ex_tyvars,
                           ppr arg_tys, ppr dc_args,        ppr _dc_univ_args,
                           ppr ex_args, ppr val_args]
      in
      in
  #ifdef DEBUG
      let dump_doc = vcat [ppr dc,      ppr dc_univ_tyvars, ppr dc_ex_tyvars,
                           ppr arg_tys, ppr dc_args,        ppr _dc_univ_args,
                           ppr ex_args, ppr val_args]
      in
-    ASSERT2( coreEqType _from_ty (mkTyConApp dc_tc _dc_univ_args), dump_doc )
-    ASSERT2( all isTypeArg (ex_args ++ co_args), dump_doc )
+    ASSERT2( eqType _from_ty (mkTyConApp dc_tc _dc_univ_args), dump_doc )
+    ASSERT2( all isTypeArg ex_args, dump_doc )
      ASSERT2( equalLength val_args arg_tys, dump_doc )
  #endif
  
      ASSERT2( equalLength val_args arg_tys, dump_doc )
  #endif
  
-    Just (dc, to_tc_arg_tys, ex_args ++ new_co_args ++ new_val_args)
+    Just (dc, to_tc_arg_tys, ex_args ++ new_val_args)
      }}
  
      }}
  
-exprIsConApp_maybe expr 
+exprIsConApp_maybe id_unf expr 
    = analyse expr [] 
    where
      analyse (App fun arg) args = analyse fun (arg:args)
    = analyse expr [] 
    where
      analyse (App fun arg) args = analyse fun (arg:args)
@@ -1109,54 +1288,50 @@ exprIsConApp_maybe expr
  
      analyse (Var fun) args
         | Just con <- isDataConWorkId_maybe fun
  
      analyse (Var fun) args
         | Just con <- isDataConWorkId_maybe fun
-        , is_saturated
+        , count isValArg args == idArity fun
         , let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars con) args
         = Just (con, stripTypeArgs univ_ty_args, rest_args)
  
         -- Look through dictionary functions; see Note [Unfolding DFuns]
         , let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars con) args
         = Just (con, stripTypeArgs univ_ty_args, rest_args)
  
         -- Look through dictionary functions; see Note [Unfolding DFuns]
-        | DFunUnfolding con ops <- unfolding
-        , is_saturated
-        , let (dfun_tvs, _cls, dfun_res_tys) = tcSplitDFunTy (idType fun)
-             subst = zipOpenTvSubst dfun_tvs (stripTypeArgs (takeList dfun_tvs args))
-        = Just (con, substTys subst dfun_res_tys, 
-                     [mkApps op args | op <- ops])
+        | DFunUnfolding dfun_nargs con ops <- unfolding
+        , let sat = length args == dfun_nargs    -- See Note [DFun arity check]
+          in if sat then True else 
+             pprTrace "Unsaturated dfun" (ppr fun <+> int dfun_nargs $$ ppr args) False   
+        , let (dfun_tvs, _n_theta, _cls, dfun_res_tys) = tcSplitDFunTy (idType fun)
+              subst    = zipOpenTvSubst dfun_tvs (stripTypeArgs (takeList dfun_tvs args))
+              mk_arg (DFunConstArg e) = e
+              mk_arg (DFunLamArg i)   = args !! i
+              mk_arg (DFunPolyArg e)  = mkApps e args
+        = Just (con, substTys subst dfun_res_tys, map mk_arg ops)
  
         -- Look through unfoldings, but only cheap ones, because
         -- we are effectively duplicating the unfolding
  
         -- Look through unfoldings, but only cheap ones, because
         -- we are effectively duplicating the unfolding
-       | CoreUnfolding { uf_expandable = expand_me, uf_tmpl = rhs } <- unfolding
-       , expand_me = -- pprTrace "expanding" (ppr fun $$ ppr rhs) $
-                      analyse rhs args
+       | Just rhs <- expandUnfolding_maybe unfolding
+       = -- pprTrace "expanding" (ppr fun $$ ppr rhs) $
+          analyse rhs args
          where
          where
-         is_saturated = count isValArg args == idArity fun
-          unfolding = idUnfolding fun
+         unfolding = id_unf fun
  
      analyse _ _ = Nothing
  
      -----------
  
      analyse _ _ = Nothing
  
      -----------
-    in_scope = mkInScopeSet (exprFreeVars expr)
-
-    -----------
      beta (Lam v body) pairs (arg : args) 
      beta (Lam v body) pairs (arg : args) 
-        | isTypeArg arg
+        | isTyCoArg arg
          = beta body ((v,arg):pairs) args 
  
      beta (Lam {}) _ _    -- Un-saturated, or not a type lambda
         = Nothing
  
      beta fun pairs args
          = beta body ((v,arg):pairs) args 
  
      beta (Lam {}) _ _    -- Un-saturated, or not a type lambda
         = Nothing
  
      beta fun pairs args
-        = case analyse (substExpr subst fun) args of
-           Nothing  -> -- pprTrace "Bale out! exprIsConApp_maybe" doc $
-                       Nothing
-           Just ans -> -- pprTrace "Woo-hoo! exprIsConApp_maybe" doc $
-                        Just ans
+        = analyse (substExpr (text "subst-expr-is-con-app") subst fun) args
          where
          where
-          subst = mkOpenSubst in_scope pairs
+          subst = mkOpenSubst (mkInScopeSet (exprFreeVars fun)) pairs
           -- doc = vcat [ppr fun, ppr expr, ppr pairs, ppr args]
  
           -- doc = vcat [ppr fun, ppr expr, ppr pairs, ppr args]
  
-
  stripTypeArgs :: [CoreExpr] -> [Type]
  stripTypeArgs args = ASSERT2( all isTypeArg args, ppr args )
                       [ty | Type ty <- args]
  stripTypeArgs :: [CoreExpr] -> [Type]
  stripTypeArgs args = ASSERT2( all isTypeArg args, ppr args )
                       [ty | Type ty <- args]
+  -- We really do want isTypeArg here, not isTyCoArg!
  \end{code}
  
  Note [Unfolding DFuns]
  \end{code}
  
  Note [Unfolding DFuns]
@@ -1171,3 +1346,8 @@ So to split it up we just need to apply the ops $c1, $c2 etc
  to the very same args as the dfun.  It takes a little more work
  to compute the type arguments to the dictionary constructor.
  
  to the very same args as the dfun.  It takes a little more work
  to compute the type arguments to the dictionary constructor.
  
+Note [DFun arity check]
+~~~~~~~~~~~~~~~~~~~~~~~
+Here we check that the total number of supplied arguments (including
+type args) matches what the dfun is expecting.  This may be *less*
+than the ordinary arity of the dfun: see Note [DFun unfoldings] in CoreSyn