X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=compiler%2FsimplCore%2FCoreMonad.lhs;h=67a0991ec0b43d7518ee46e6776b78b626637b99;hp=f80608956273be8db329c19dc5acadd1a25eaa82;hb=5909e9a896d40a18b4bcf6abb95e0b071bfd7db2;hpb=72462499b891d5779c19f3bda03f96e24f9554ae

diff --git a/compiler/simplCore/CoreMonad.lhs b/compiler/simplCore/CoreMonad.lhs
index f806089..67a0991 100644
--- a/compiler/simplCore/CoreMonad.lhs
+++ b/compiler/simplCore/CoreMonad.lhs
@@ -7,11 +7,21 @@
 {-# LANGUAGE UndecidableInstances #-}
 
 module CoreMonad (
+    -- * Configuration of the core-to-core passes
+    CoreToDo(..),
+    SimplifierMode(..),
+    FloatOutSwitches(..),
+    getCoreToDo, dumpSimplPhase,
+
+    -- * Counting
+    SimplCount, doSimplTick, doFreeSimplTick, simplCountN,
+    pprSimplCount, plusSimplCount, zeroSimplCount, isZeroSimplCount, Tick(..),
+
     -- * The monad
     CoreM, runCoreM,
     
     -- ** Reading from the monad
-    getHscEnv, getAnnEnv, getRuleBase, getModule,
+    getHscEnv, getRuleBase, getModule,
     getDynFlags, getOrigNameCache,
     
     -- ** Writing to the monad
@@ -22,16 +32,18 @@ module CoreMonad (
     liftIO1, liftIO2, liftIO3, liftIO4,
     
     -- ** Dealing with annotations
-    findAnnotations, deserializeAnnotations, addAnnotation,
+    getAnnotations, getFirstAnnotations,
     
     -- ** Debug output
-    endPass, endPassIf, 
+    showPass, endPass, endIteration, dumpIfSet,
 
     -- ** Screen output
     putMsg, putMsgS, errorMsg, errorMsgS, 
     fatalErrorMsg, fatalErrorMsgS, 
     debugTraceMsg, debugTraceMsgS,
-    dumpIfSet_dyn,
+    dumpIfSet_dyn, 
+
+    lookupOrigCoreM,
 
 #ifdef GHCI
     -- * Getting 'Name's
@@ -49,11 +61,12 @@ import CoreLint		( lintCoreBindings )
 import PrelNames        ( iNTERACTIVE )
 import HscTypes
 import Module           ( Module )
-import DynFlags         ( DynFlags, DynFlag )
-import SimplMonad       ( SimplCount, plusSimplCount, zeroSimplCount )
+import DynFlags
+import StaticFlags	
 import Rules            ( RuleBase )
+import BasicTypes       ( CompilerPhase(..) )
 import Annotations
-import Serialized
+import Id		( Id )
 
 import IOEnv hiding     ( liftIO, failM, failWithM )
 import qualified IOEnv  ( liftIO )
@@ -63,16 +76,27 @@ import TcRnMonad        ( TcM, initTc )
 import Outputable
 import FastString
 import qualified ErrUtils as Err
+import Bag
 import Maybes
 import UniqSupply
-import LazyUniqFM       ( UniqFM )
+import UniqFM       ( UniqFM, mapUFM, filterUFM )
+import MonadUtils
 
+import Util		( split )
+import Data.List	( intersperse )
 import Data.Dynamic
 import Data.IORef
+import Data.Map (Map)
+import qualified Data.Map as Map
 import Data.Word
 import Control.Monad
 
 import Prelude hiding   ( read )
+import OccName
+import IfaceEnv
+import Name
+import SrcLoc
+import Control.Exception.Base
 
 #ifdef GHCI
 import {-# SOURCE #-} TcSplice ( lookupThName_maybe )
@@ -91,31 +115,708 @@ be, and it makes a conveneint place.  place for them.  They print out
 stuff before and after core passes, and do Core Lint when necessary.
 
 \begin{code}
-endPass :: DynFlags -> String -> DynFlag -> [CoreBind] -> [CoreRule] -> IO ()
-endPass = dumpAndLint Err.dumpIfSet_core
+showPass :: DynFlags -> CoreToDo -> IO ()
+showPass dflags pass = Err.showPass dflags (showSDoc (ppr pass))
+
+endPass :: DynFlags -> CoreToDo -> [CoreBind] -> [CoreRule] -> IO ()
+endPass dflags pass = dumpAndLint dflags True pass empty (coreDumpFlag pass)
+
+-- Same as endPass but doesn't dump Core even with -dverbose-core2core
+endIteration :: DynFlags -> CoreToDo -> Int -> [CoreBind] -> [CoreRule] -> IO ()
+endIteration dflags pass n
+  = dumpAndLint dflags False pass (ptext (sLit "iteration=") <> int n)
+                (Just Opt_D_dump_simpl_iterations)
 
-endPassIf :: Bool -> DynFlags -> String -> DynFlag -> [CoreBind] -> [CoreRule] -> IO ()
-endPassIf cond = dumpAndLint (Err.dumpIf_core cond)
+dumpIfSet :: Bool -> CoreToDo -> SDoc -> SDoc -> IO ()
+dumpIfSet dump_me pass extra_info doc
+  = Err.dumpIfSet dump_me (showSDoc (ppr pass <+> extra_info)) doc
 
-dumpAndLint :: (DynFlags -> DynFlag -> String -> SDoc -> IO ())
-            -> DynFlags -> String -> DynFlag 
+dumpAndLint :: DynFlags -> Bool -> CoreToDo -> SDoc -> Maybe DynFlag
             -> [CoreBind] -> [CoreRule] -> IO ()
-dumpAndLint dump dflags pass_name dump_flag binds rules
+-- The "show_all" parameter says to print dump if -dverbose-core2core is on
+dumpAndLint dflags show_all pass extra_info mb_dump_flag binds rules
   = do {  -- Report result size if required
 	  -- This has the side effect of forcing the intermediate to be evaluated
        ; Err.debugTraceMsg dflags 2 $
 		(text "    Result size =" <+> int (coreBindsSize binds))
 
 	-- Report verbosely, if required
-       ; dump dflags dump_flag pass_name
-              (pprCoreBindings binds $$ ppUnless (null rules) pp_rules)
+       ; let pass_name = showSDoc (ppr pass <+> extra_info)
+             dump_doc  = pprCoreBindings binds 
+                         $$ ppUnless (null rules) pp_rules
+
+       ; case mb_dump_flag of
+            Nothing        -> return ()
+            Just dump_flag -> Err.dumpIfSet_dyn_or dflags dump_flags pass_name dump_doc
+               where
+                 dump_flags | show_all  = [dump_flag, Opt_D_verbose_core2core]
+		 	    | otherwise = [dump_flag] 
 
 	-- Type check
-       ; lintCoreBindings dflags pass_name binds }
+       ; when (dopt Opt_DoCoreLinting dflags) $
+         do { let (warns, errs) = lintCoreBindings binds
+            ; Err.showPass dflags ("Core Linted result of " ++ pass_name)
+            ; displayLintResults dflags pass warns errs binds  } }
   where
     pp_rules = vcat [ blankLine
                     , ptext (sLit "------ Local rules for imported ids --------")
                     , pprRules rules ]
+
+displayLintResults :: DynFlags -> CoreToDo
+                   -> Bag Err.Message -> Bag Err.Message -> [CoreBind]
+                   -> IO ()
+displayLintResults dflags pass warns errs binds
+  | not (isEmptyBag errs)
+  = do { printDump (vcat [ banner "errors", Err.pprMessageBag errs
+			 , ptext (sLit "*** Offending Program ***")
+			 , pprCoreBindings binds
+			 , ptext (sLit "*** End of Offense ***") ])
+       ; Err.ghcExit dflags 1 }
+
+  | not (isEmptyBag warns)
+  , not (case pass of { CoreDesugar -> True; _ -> False })
+    	-- Suppress warnings after desugaring pass because some
+	-- are legitimate. Notably, the desugarer generates instance
+	-- methods with INLINE pragmas that form a mutually recursive
+	-- group.  Only afer a round of simplification are they unravelled.
+  , not opt_NoDebugOutput
+  , showLintWarnings pass
+  = printDump (banner "warnings" $$ Err.pprMessageBag warns)
+
+  | otherwise = return ()
+  where
+    banner string = ptext (sLit "*** Core Lint")      <+> text string 
+                    <+> ptext (sLit ": in result of") <+> ppr pass
+                    <+> ptext (sLit "***")
+
+showLintWarnings :: CoreToDo -> Bool
+-- Disable Lint warnings on the first simplifier pass, because
+-- there may be some INLINE knots still tied, which is tiresomely noisy
+showLintWarnings (CoreDoSimplify _ (SimplMode { sm_phase = InitialPhase })) = False
+showLintWarnings _ = True
+\end{code}
+
+
+%************************************************************************
+%*									*
+              The CoreToDo type and related types
+	  Abstraction of core-to-core passes to run.
+%*									*
+%************************************************************************
+
+\begin{code}
+data CoreToDo           -- These are diff core-to-core passes,
+                        -- which may be invoked in any order,
+                        -- as many times as you like.
+
+  = CoreDoSimplify      -- The core-to-core simplifier.
+        Int                    -- Max iterations
+        SimplifierMode
+
+  | CoreDoFloatInwards
+  | CoreDoFloatOutwards FloatOutSwitches
+  | CoreLiberateCase
+  | CoreDoPrintCore
+  | CoreDoStaticArgs
+  | CoreDoStrictness
+  | CoreDoWorkerWrapper
+  | CoreDoSpecialising
+  | CoreDoSpecConstr
+  | CoreDoGlomBinds
+  | CoreCSE
+  | CoreDoRuleCheck CompilerPhase String   -- Check for non-application of rules
+                                           -- matching this string
+  | CoreDoVectorisation
+  | CoreDoNothing                -- Useful when building up
+  | CoreDoPasses [CoreToDo]      -- lists of these things
+
+  | CoreDesugar	 -- Not strictly a core-to-core pass, but produces
+                 -- Core output, and hence useful to pass to endPass
+
+  | CoreTidy
+  | CorePrep
+
+coreDumpFlag :: CoreToDo -> Maybe DynFlag
+coreDumpFlag (CoreDoSimplify {})      = Just Opt_D_dump_simpl_phases
+coreDumpFlag CoreDoFloatInwards       = Just Opt_D_verbose_core2core
+coreDumpFlag (CoreDoFloatOutwards {}) = Just Opt_D_verbose_core2core
+coreDumpFlag CoreLiberateCase         = Just Opt_D_verbose_core2core
+coreDumpFlag CoreDoStaticArgs 	      = Just Opt_D_verbose_core2core
+coreDumpFlag CoreDoStrictness 	      = Just Opt_D_dump_stranal
+coreDumpFlag CoreDoWorkerWrapper      = Just Opt_D_dump_worker_wrapper
+coreDumpFlag CoreDoSpecialising       = Just Opt_D_dump_spec
+coreDumpFlag CoreDoSpecConstr         = Just Opt_D_dump_spec
+coreDumpFlag CoreCSE                  = Just Opt_D_dump_cse 
+coreDumpFlag CoreDoVectorisation      = Just Opt_D_dump_vect
+coreDumpFlag CoreDesugar              = Just Opt_D_dump_ds 
+coreDumpFlag CoreTidy                 = Just Opt_D_dump_simpl
+coreDumpFlag CorePrep                 = Just Opt_D_dump_prep
+
+coreDumpFlag CoreDoPrintCore         = Nothing
+coreDumpFlag (CoreDoRuleCheck {})    = Nothing
+coreDumpFlag CoreDoNothing           = Nothing
+coreDumpFlag CoreDoGlomBinds         = Nothing
+coreDumpFlag (CoreDoPasses {})       = Nothing
+
+instance Outputable CoreToDo where
+  ppr (CoreDoSimplify n md)  = ptext (sLit "Simplifier")
+                               <+> ppr md
+                                 <+> ptext (sLit "max-iterations=") <> int n
+  ppr CoreDoFloatInwards       = ptext (sLit "Float inwards")
+  ppr (CoreDoFloatOutwards f)  = ptext (sLit "Float out") <> parens (ppr f)
+  ppr CoreLiberateCase         = ptext (sLit "Liberate case")
+  ppr CoreDoStaticArgs 	       = ptext (sLit "Static argument")
+  ppr CoreDoStrictness 	       = ptext (sLit "Demand analysis")
+  ppr CoreDoWorkerWrapper      = ptext (sLit "Worker Wrapper binds")
+  ppr CoreDoSpecialising       = ptext (sLit "Specialise")
+  ppr CoreDoSpecConstr         = ptext (sLit "SpecConstr")
+  ppr CoreCSE                  = ptext (sLit "Common sub-expression")
+  ppr CoreDoVectorisation      = ptext (sLit "Vectorisation")
+  ppr CoreDesugar              = ptext (sLit "Desugar")
+  ppr CoreTidy                 = ptext (sLit "Tidy Core")
+  ppr CorePrep 		       = ptext (sLit "CorePrep")
+  ppr CoreDoPrintCore          = ptext (sLit "Print core")
+  ppr (CoreDoRuleCheck {})     = ptext (sLit "Rule check")
+  ppr CoreDoGlomBinds          = ptext (sLit "Glom binds")
+  ppr CoreDoNothing            = ptext (sLit "CoreDoNothing")
+  ppr (CoreDoPasses {})        = ptext (sLit "CoreDoPasses")
+\end{code}
+
+\begin{code}
+data SimplifierMode             -- See comments in SimplMonad
+  = SimplMode
+        { sm_names      :: [String] -- Name(s) of the phase
+        , sm_phase      :: CompilerPhase
+        , sm_rules      :: Bool     -- Whether RULES are enabled
+        , sm_inline     :: Bool     -- Whether inlining is enabled
+        , sm_case_case  :: Bool     -- Whether case-of-case is enabled
+        , sm_eta_expand :: Bool     -- Whether eta-expansion is enabled
+        }
+
+instance Outputable SimplifierMode where
+    ppr (SimplMode { sm_phase = p, sm_names = ss
+                   , sm_rules = r, sm_inline = i
+                   , sm_eta_expand = eta, sm_case_case = cc })
+       = ptext (sLit "SimplMode") <+> braces (
+         sep [ ptext (sLit "Phase =") <+> ppr p <+>
+               brackets (text (concat $ intersperse "," ss)) <> comma
+             , pp_flag i   (sLit "inline") <> comma
+             , pp_flag r   (sLit "rules") <> comma
+             , pp_flag eta (sLit "eta-expand") <> comma
+             , pp_flag cc  (sLit "case-of-case") ])
+	 where
+           pp_flag f s = ppUnless f (ptext (sLit "no")) <+> ptext s
+\end{code}
+
+
+\begin{code}
+data FloatOutSwitches = FloatOutSwitches {
+  floatOutLambdas   :: Maybe Int,  -- ^ Just n <=> float lambdas to top level, if
+                                   -- doing so will abstract over n or fewer 
+                                   -- value variables
+				   -- Nothing <=> float all lambdas to top level,
+                                   --             regardless of how many free variables
+                                   -- Just 0 is the vanilla case: float a lambda
+                                   --    iff it has no free vars
+
+  floatOutConstants :: Bool,       -- ^ True <=> float constants to top level,
+                                   --            even if they do not escape a lambda
+  floatOutPartialApplications :: Bool -- ^ True <=> float out partial applications
+                                            --            based on arity information.
+  }
+instance Outputable FloatOutSwitches where
+    ppr = pprFloatOutSwitches
+
+pprFloatOutSwitches :: FloatOutSwitches -> SDoc
+pprFloatOutSwitches sw 
+  = ptext (sLit "FOS") <+> (braces $
+     sep $ punctuate comma $ 
+     [ ptext (sLit "Lam =")    <+> ppr (floatOutLambdas sw)
+     , ptext (sLit "Consts =") <+> ppr (floatOutConstants sw)
+     , ptext (sLit "PAPs =")   <+> ppr (floatOutPartialApplications sw) ])
+\end{code}
+
+
+%************************************************************************
+%*									*
+           Generating the main optimisation pipeline
+%*									*
+%************************************************************************
+
+\begin{code}
+getCoreToDo :: DynFlags -> [CoreToDo]
+getCoreToDo dflags
+  = core_todo
+  where
+    opt_level     = optLevel           dflags
+    phases        = simplPhases        dflags
+    max_iter      = maxSimplIterations dflags
+    rule_check    = ruleCheck          dflags
+    strictness    = dopt Opt_Strictness   		  dflags
+    full_laziness = dopt Opt_FullLaziness 		  dflags
+    do_specialise = dopt Opt_Specialise   		  dflags
+    do_float_in   = dopt Opt_FloatIn      		  dflags          
+    cse           = dopt Opt_CSE                          dflags
+    spec_constr   = dopt Opt_SpecConstr                   dflags
+    liberate_case = dopt Opt_LiberateCase                 dflags
+    static_args   = dopt Opt_StaticArgumentTransformation dflags
+    rules_on      = dopt Opt_EnableRewriteRules           dflags
+    eta_expand_on = dopt Opt_DoLambdaEtaExpansion         dflags
+
+    maybe_rule_check phase = runMaybe rule_check (CoreDoRuleCheck phase)
+
+    maybe_strictness_before phase
+      = runWhen (phase `elem` strictnessBefore dflags) CoreDoStrictness
+
+    base_mode = SimplMode { sm_phase      = panic "base_mode"
+                          , sm_names      = []
+                          , sm_rules      = rules_on
+                          , sm_eta_expand = eta_expand_on
+                          , sm_inline     = True
+                          , sm_case_case  = True }
+
+    simpl_phase phase names iter
+      = CoreDoPasses
+      $   [ maybe_strictness_before phase
+          , CoreDoSimplify iter
+                (base_mode { sm_phase = Phase phase
+                           , sm_names = names })
+
+          , maybe_rule_check (Phase phase) ]
+
+          -- Vectorisation can introduce a fair few common sub expressions involving 
+          --  DPH primitives. For example, see the Reverse test from dph-examples.
+          --  We need to eliminate these common sub expressions before their definitions
+          --  are inlined in phase 2. The CSE introduces lots of  v1 = v2 bindings, 
+          --  so we also run simpl_gently to inline them.
+      ++  (if dopt Opt_Vectorise dflags && phase == 3
+	    then [CoreCSE, simpl_gently]
+	    else [])
+
+    vectorisation
+      = runWhen (dopt Opt_Vectorise dflags) $
+          CoreDoPasses [ simpl_gently, CoreDoVectorisation ]
+
+                -- By default, we have 2 phases before phase 0.
+
+                -- Want to run with inline phase 2 after the specialiser to give
+                -- maximum chance for fusion to work before we inline build/augment
+                -- in phase 1.  This made a difference in 'ansi' where an
+                -- overloaded function wasn't inlined till too late.
+
+                -- Need phase 1 so that build/augment get
+                -- inlined.  I found that spectral/hartel/genfft lost some useful
+                -- strictness in the function sumcode' if augment is not inlined
+                -- before strictness analysis runs
+    simpl_phases = CoreDoPasses [ simpl_phase phase ["main"] max_iter
+                                | phase <- [phases, phases-1 .. 1] ]
+
+
+        -- initial simplify: mk specialiser happy: minimum effort please
+    simpl_gently = CoreDoSimplify max_iter
+                       (base_mode { sm_phase = InitialPhase
+                                  , sm_names = ["Gentle"]
+                                  , sm_rules = rules_on   -- Note [RULEs enabled in SimplGently]
+                                  , sm_inline = False
+                                  , sm_case_case = False })
+                          -- Don't do case-of-case transformations.
+                          -- This makes full laziness work better
+
+    core_todo =
+     if opt_level == 0 then
+       [vectorisation,
+        simpl_phase 0 ["final"] max_iter]
+     else {- opt_level >= 1 -} [
+
+    -- We want to do the static argument transform before full laziness as it
+    -- may expose extra opportunities to float things outwards. However, to fix
+    -- up the output of the transformation we need at do at least one simplify
+    -- after this before anything else
+        runWhen static_args (CoreDoPasses [ simpl_gently, CoreDoStaticArgs ]),
+
+        -- We run vectorisation here for now, but we might also try to run
+        -- it later
+        vectorisation,
+
+        -- initial simplify: mk specialiser happy: minimum effort please
+        simpl_gently,
+
+        -- Specialisation is best done before full laziness
+        -- so that overloaded functions have all their dictionary lambdas manifest
+        runWhen do_specialise CoreDoSpecialising,
+
+        runWhen full_laziness $
+           CoreDoFloatOutwards FloatOutSwitches {
+                                 floatOutLambdas   = Just 0,
+                                 floatOutConstants = True,
+                                 floatOutPartialApplications = False },
+      		-- Was: gentleFloatOutSwitches	
+                --
+		-- I have no idea why, but not floating constants to
+		-- top level is very bad in some cases.
+                --
+		-- Notably: p_ident in spectral/rewrite
+		-- 	    Changing from "gentle" to "constantsOnly"
+		-- 	    improved rewrite's allocation by 19%, and
+		-- 	    made 0.0% difference to any other nofib
+		-- 	    benchmark
+                --
+                -- Not doing floatOutPartialApplications yet, we'll do
+                -- that later on when we've had a chance to get more
+                -- accurate arity information.  In fact it makes no
+                -- difference at all to performance if we do it here,
+                -- but maybe we save some unnecessary to-and-fro in
+                -- the simplifier.
+
+        runWhen do_float_in CoreDoFloatInwards,
+
+        simpl_phases,
+
+                -- Phase 0: allow all Ids to be inlined now
+                -- This gets foldr inlined before strictness analysis
+
+                -- At least 3 iterations because otherwise we land up with
+                -- huge dead expressions because of an infelicity in the
+                -- simpifier.
+                --      let k = BIG in foldr k z xs
+                -- ==>  let k = BIG in letrec go = \xs -> ...(k x).... in go xs
+                -- ==>  let k = BIG in letrec go = \xs -> ...(BIG x).... in go xs
+                -- Don't stop now!
+        simpl_phase 0 ["main"] (max max_iter 3),
+
+        runWhen strictness (CoreDoPasses [
+                CoreDoStrictness,
+                CoreDoWorkerWrapper,
+                CoreDoGlomBinds,
+                simpl_phase 0 ["post-worker-wrapper"] max_iter
+                ]),
+
+        runWhen full_laziness $
+           CoreDoFloatOutwards FloatOutSwitches {
+                                 floatOutLambdas   = floatLamArgs dflags,
+                                 floatOutConstants = True,
+                                 floatOutPartialApplications = True },
+                -- nofib/spectral/hartel/wang doubles in speed if you
+                -- do full laziness late in the day.  It only happens
+                -- after fusion and other stuff, so the early pass doesn't
+                -- catch it.  For the record, the redex is
+                --        f_el22 (f_el21 r_midblock)
+
+
+        runWhen cse CoreCSE,
+                -- We want CSE to follow the final full-laziness pass, because it may
+                -- succeed in commoning up things floated out by full laziness.
+                -- CSE used to rely on the no-shadowing invariant, but it doesn't any more
+
+        runWhen do_float_in CoreDoFloatInwards,
+
+        maybe_rule_check (Phase 0),
+
+                -- Case-liberation for -O2.  This should be after
+                -- strictness analysis and the simplification which follows it.
+        runWhen liberate_case (CoreDoPasses [
+            CoreLiberateCase,
+            simpl_phase 0 ["post-liberate-case"] max_iter
+            ]),         -- Run the simplifier after LiberateCase to vastly
+                        -- reduce the possiblility of shadowing
+                        -- Reason: see Note [Shadowing] in SpecConstr.lhs
+
+        runWhen spec_constr CoreDoSpecConstr,
+
+        maybe_rule_check (Phase 0),
+
+        -- Final clean-up simplification:
+        simpl_phase 0 ["final"] max_iter
+     ]
+
+-- The core-to-core pass ordering is derived from the DynFlags:
+runWhen :: Bool -> CoreToDo -> CoreToDo
+runWhen True  do_this = do_this
+runWhen False _       = CoreDoNothing
+
+runMaybe :: Maybe a -> (a -> CoreToDo) -> CoreToDo
+runMaybe (Just x) f = f x
+runMaybe Nothing  _ = CoreDoNothing
+
+dumpSimplPhase :: DynFlags -> SimplifierMode -> Bool
+dumpSimplPhase dflags mode
+   | Just spec_string <- shouldDumpSimplPhase dflags
+   = match_spec spec_string
+   | otherwise
+   = dopt Opt_D_verbose_core2core dflags
+
+  where
+    match_spec :: String -> Bool
+    match_spec spec_string 
+      = or $ map (and . map match . split ':') 
+           $ split ',' spec_string
+
+    match :: String -> Bool
+    match "" = True
+    match s  = case reads s of
+                [(n,"")] -> phase_num  n
+                _        -> phase_name s
+
+    phase_num :: Int -> Bool
+    phase_num n = case sm_phase mode of
+                    Phase k -> n == k
+                    _       -> False
+
+    phase_name :: String -> Bool
+    phase_name s = s `elem` sm_names mode
+\end{code}
+
+
+Note [RULEs enabled in SimplGently]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RULES are enabled when doing "gentle" simplification.  Two reasons:
+
+  * We really want the class-op cancellation to happen:
+        op (df d1 d2) --> $cop3 d1 d2
+    because this breaks the mutual recursion between 'op' and 'df'
+
+  * I wanted the RULE
+        lift String ===> ...
+    to work in Template Haskell when simplifying
+    splices, so we get simpler code for literal strings
+
+But watch out: list fusion can prevent floating.  So use phase control
+to switch off those rules until after floating.
+
+
+%************************************************************************
+%*									*
+             Counting and logging
+%*									*
+%************************************************************************
+
+\begin{code}
+verboseSimplStats :: Bool
+verboseSimplStats = opt_PprStyle_Debug		-- For now, anyway
+
+zeroSimplCount	   :: DynFlags -> SimplCount
+isZeroSimplCount   :: SimplCount -> Bool
+pprSimplCount	   :: SimplCount -> SDoc
+doSimplTick, doFreeSimplTick :: Tick -> SimplCount -> SimplCount
+plusSimplCount     :: SimplCount -> SimplCount -> SimplCount
+\end{code}
+
+\begin{code}
+data SimplCount 
+   = VerySimplCount !Int	-- Used when don't want detailed stats
+
+   | SimplCount	{
+	ticks   :: !Int,	-- Total ticks
+	details :: !TickCounts,	-- How many of each type
+
+	n_log	:: !Int,	-- N
+	log1	:: [Tick],	-- Last N events; <= opt_HistorySize, 
+		   		--   most recent first
+	log2	:: [Tick]	-- Last opt_HistorySize events before that
+		   		-- Having log1, log2 lets us accumulate the
+				-- recent history reasonably efficiently
+     }
+
+type TickCounts = Map Tick Int
+
+simplCountN :: SimplCount -> Int
+simplCountN (VerySimplCount n)         = n
+simplCountN (SimplCount { ticks = n }) = n
+
+zeroSimplCount dflags
+		-- This is where we decide whether to do
+		-- the VerySimpl version or the full-stats version
+  | dopt Opt_D_dump_simpl_stats dflags
+  = SimplCount {ticks = 0, details = Map.empty,
+                n_log = 0, log1 = [], log2 = []}
+  | otherwise
+  = VerySimplCount 0
+
+isZeroSimplCount (VerySimplCount n)    	    = n==0
+isZeroSimplCount (SimplCount { ticks = n }) = n==0
+
+doFreeSimplTick tick sc@SimplCount { details = dts } 
+  = sc { details = dts `addTick` tick }
+doFreeSimplTick _ sc = sc 
+
+doSimplTick tick sc@SimplCount { ticks = tks, details = dts, n_log = nl, log1 = l1 }
+  | nl >= opt_HistorySize = sc1 { n_log = 1, log1 = [tick], log2 = l1 }
+  | otherwise		  = sc1 { n_log = nl+1, log1 = tick : l1 }
+  where
+    sc1 = sc { ticks = tks+1, details = dts `addTick` tick }
+
+doSimplTick _ (VerySimplCount n) = VerySimplCount (n+1)
+
+
+-- Don't use Map.unionWith because that's lazy, and we want to 
+-- be pretty strict here!
+addTick :: TickCounts -> Tick -> TickCounts
+addTick fm tick = case Map.lookup tick fm of
+			Nothing -> Map.insert tick 1 fm
+			Just n  -> n1 `seq` Map.insert tick n1 fm
+				where
+				   n1 = n+1
+
+
+plusSimplCount sc1@(SimplCount { ticks = tks1, details = dts1 })
+	       sc2@(SimplCount { ticks = tks2, details = dts2 })
+  = log_base { ticks = tks1 + tks2, details = Map.unionWith (+) dts1 dts2 }
+  where
+	-- A hackish way of getting recent log info
+    log_base | null (log1 sc2) = sc1	-- Nothing at all in sc2
+	     | null (log2 sc2) = sc2 { log2 = log1 sc1 }
+	     | otherwise       = sc2
+
+plusSimplCount (VerySimplCount n) (VerySimplCount m) = VerySimplCount (n+m)
+plusSimplCount _                  _                  = panic "plusSimplCount"
+       -- We use one or the other consistently
+
+pprSimplCount (VerySimplCount n) = ptext (sLit "Total ticks:") <+> int n
+pprSimplCount (SimplCount { ticks = tks, details = dts, log1 = l1, log2 = l2 })
+  = vcat [ptext (sLit "Total ticks:    ") <+> int tks,
+	  blankLine,
+	  pprTickCounts (Map.toList dts),
+	  if verboseSimplStats then
+		vcat [blankLine,
+		      ptext (sLit "Log (most recent first)"),
+		      nest 4 (vcat (map ppr l1) $$ vcat (map ppr l2))]
+	  else empty
+    ]
+
+pprTickCounts :: [(Tick,Int)] -> SDoc
+pprTickCounts [] = empty
+pprTickCounts ((tick1,n1):ticks)
+  = vcat [int tot_n <+> text (tickString tick1),
+	  pprTCDetails real_these,
+	  pprTickCounts others
+    ]
+  where
+    tick1_tag		= tickToTag tick1
+    (these, others)	= span same_tick ticks
+    real_these		= (tick1,n1):these
+    same_tick (tick2,_) = tickToTag tick2 == tick1_tag
+    tot_n		= sum [n | (_,n) <- real_these]
+
+pprTCDetails :: [(Tick, Int)] -> SDoc
+pprTCDetails ticks
+  = nest 4 (vcat [int n <+> pprTickCts tick | (tick,n) <- ticks])
+\end{code}
+
+
+\begin{code}
+data Tick
+  = PreInlineUnconditionally	Id
+  | PostInlineUnconditionally	Id
+
+  | UnfoldingDone    		Id
+  | RuleFired			FastString	-- Rule name
+
+  | LetFloatFromLet
+  | EtaExpansion		Id	-- LHS binder
+  | EtaReduction		Id	-- Binder on outer lambda
+  | BetaReduction		Id	-- Lambda binder
+
+
+  | CaseOfCase			Id	-- Bndr on *inner* case
+  | KnownBranch			Id	-- Case binder
+  | CaseMerge			Id	-- Binder on outer case
+  | AltMerge			Id	-- Case binder
+  | CaseElim			Id	-- Case binder
+  | CaseIdentity		Id	-- Case binder
+  | FillInCaseDefault		Id	-- Case binder
+
+  | BottomFound		
+  | SimplifierDone		-- Ticked at each iteration of the simplifier
+
+instance Outputable Tick where
+  ppr tick = text (tickString tick) <+> pprTickCts tick
+
+instance Eq Tick where
+  a == b = case a `cmpTick` b of
+           EQ -> True
+           _ -> False
+
+instance Ord Tick where
+  compare = cmpTick
+
+tickToTag :: Tick -> Int
+tickToTag (PreInlineUnconditionally _)	= 0
+tickToTag (PostInlineUnconditionally _)	= 1
+tickToTag (UnfoldingDone _)		= 2
+tickToTag (RuleFired _)			= 3
+tickToTag LetFloatFromLet		= 4
+tickToTag (EtaExpansion _)		= 5
+tickToTag (EtaReduction _)		= 6
+tickToTag (BetaReduction _)		= 7
+tickToTag (CaseOfCase _)		= 8
+tickToTag (KnownBranch _)		= 9
+tickToTag (CaseMerge _)			= 10
+tickToTag (CaseElim _)			= 11
+tickToTag (CaseIdentity _)		= 12
+tickToTag (FillInCaseDefault _)		= 13
+tickToTag BottomFound			= 14
+tickToTag SimplifierDone		= 16
+tickToTag (AltMerge _)			= 17
+
+tickString :: Tick -> String
+tickString (PreInlineUnconditionally _)	= "PreInlineUnconditionally"
+tickString (PostInlineUnconditionally _)= "PostInlineUnconditionally"
+tickString (UnfoldingDone _)		= "UnfoldingDone"
+tickString (RuleFired _)		= "RuleFired"
+tickString LetFloatFromLet		= "LetFloatFromLet"
+tickString (EtaExpansion _)		= "EtaExpansion"
+tickString (EtaReduction _)		= "EtaReduction"
+tickString (BetaReduction _)		= "BetaReduction"
+tickString (CaseOfCase _)		= "CaseOfCase"
+tickString (KnownBranch _)		= "KnownBranch"
+tickString (CaseMerge _)		= "CaseMerge"
+tickString (AltMerge _)			= "AltMerge"
+tickString (CaseElim _)			= "CaseElim"
+tickString (CaseIdentity _)		= "CaseIdentity"
+tickString (FillInCaseDefault _)	= "FillInCaseDefault"
+tickString BottomFound			= "BottomFound"
+tickString SimplifierDone		= "SimplifierDone"
+
+pprTickCts :: Tick -> SDoc
+pprTickCts (PreInlineUnconditionally v)	= ppr v
+pprTickCts (PostInlineUnconditionally v)= ppr v
+pprTickCts (UnfoldingDone v)		= ppr v
+pprTickCts (RuleFired v)		= ppr v
+pprTickCts LetFloatFromLet		= empty
+pprTickCts (EtaExpansion v)		= ppr v
+pprTickCts (EtaReduction v)		= ppr v
+pprTickCts (BetaReduction v)		= ppr v
+pprTickCts (CaseOfCase v)		= ppr v
+pprTickCts (KnownBranch v)		= ppr v
+pprTickCts (CaseMerge v)		= ppr v
+pprTickCts (AltMerge v)			= ppr v
+pprTickCts (CaseElim v)			= ppr v
+pprTickCts (CaseIdentity v)		= ppr v
+pprTickCts (FillInCaseDefault v)	= ppr v
+pprTickCts _    			= empty
+
+cmpTick :: Tick -> Tick -> Ordering
+cmpTick a b = case (tickToTag a `compare` tickToTag b) of
+		GT -> GT
+		EQ -> cmpEqTick a b
+		LT -> LT
+
+cmpEqTick :: Tick -> Tick -> Ordering
+cmpEqTick (PreInlineUnconditionally a)	(PreInlineUnconditionally b)	= a `compare` b
+cmpEqTick (PostInlineUnconditionally a)	(PostInlineUnconditionally b)	= a `compare` b
+cmpEqTick (UnfoldingDone a)		(UnfoldingDone b)		= a `compare` b
+cmpEqTick (RuleFired a)			(RuleFired b)			= a `compare` b
+cmpEqTick (EtaExpansion a)		(EtaExpansion b)		= a `compare` b
+cmpEqTick (EtaReduction a)		(EtaReduction b)		= a `compare` b
+cmpEqTick (BetaReduction a)		(BetaReduction b)		= a `compare` b
+cmpEqTick (CaseOfCase a)		(CaseOfCase b)			= a `compare` b
+cmpEqTick (KnownBranch a)		(KnownBranch b)			= a `compare` b
+cmpEqTick (CaseMerge a)			(CaseMerge b)			= a `compare` b
+cmpEqTick (AltMerge a)			(AltMerge b)			= a `compare` b
+cmpEqTick (CaseElim a)			(CaseElim b)			= a `compare` b
+cmpEqTick (CaseIdentity a)		(CaseIdentity b)		= a `compare` b
+cmpEqTick (FillInCaseDefault a)		(FillInCaseDefault b)		= a `compare` b
+cmpEqTick _     			_     				= EQ
 \end{code}
 
 
@@ -126,9 +827,8 @@ dumpAndLint dump dflags pass_name dump_flag binds rules
 %************************************************************************
 
 \begin{code}
-data CoreState = CoreState {
-        cs_uniq_supply :: UniqSupply,
-        cs_ann_env :: AnnEnv
+newtype CoreState = CoreState {
+        cs_uniq_supply :: UniqSupply
 }
 
 data CoreReader = CoreReader {
@@ -187,13 +887,12 @@ instance MonadUnique CoreM where
         return us1
 
 runCoreM :: HscEnv
-         -> AnnEnv
          -> RuleBase
          -> UniqSupply
          -> Module
          -> CoreM a
          -> IO (a, SimplCount)
-runCoreM hsc_env ann_env rule_base us mod m =
+runCoreM hsc_env rule_base us mod m =
         liftM extract $ runIOEnv reader $ unCoreM m state
   where
     reader = CoreReader {
@@ -202,8 +901,7 @@ runCoreM hsc_env ann_env rule_base us mod m =
             cr_module = mod
         }
     state = CoreState { 
-            cs_uniq_supply = us,
-            cs_ann_env = ann_env
+            cs_uniq_supply = us
         }
 
     extract :: (a, CoreState, CoreWriter) -> (a, SimplCount)
@@ -268,9 +966,6 @@ liftIOWithCount what = liftIO what >>= (\(count, x) -> addSimplCount count >> re
 getHscEnv :: CoreM HscEnv
 getHscEnv = read cr_hsc_env
 
-getAnnEnv :: CoreM AnnEnv
-getAnnEnv = getS cs_ann_env
-
 getRuleBase :: CoreM RuleBase
 getRuleBase = read cr_rule_base
 
@@ -302,38 +997,45 @@ getOrigNameCache = do
 %************************************************************************
 
 \begin{code}
-
--- | Find all the annotations we currently know about for the given target. Note that no
--- annotations will be returned if we haven't loaded information about the particular target
--- you are inquiring about: by default, only those modules that have been imported by the
--- program being compiled will have been loaded in this way.
+-- | Get all annotations of a given type. This happens lazily, that is
+-- no deserialization will take place until the [a] is actually demanded and
+-- the [a] can also be empty (the UniqFM is not filtered).
 --
--- To load the information from additional modules, you can use the functions 'DynamicLoading.forceLoadModuleInterfaces'
--- and 'DynamicLoading.forceLoadNameModuleInterface', but be aware that doing this indiscriminantly
--- will impose a performance penalty.
+-- This should be done once at the start of a Core-to-Core pass that uses
+-- annotations.
 --
--- If no deserialization function is supplied, only transient annotations will be returned.
-findAnnotations :: Typeable a => ([Word8] -> a) -> CoreAnnTarget -> CoreM [a]
-findAnnotations deserialize target = do
-     ann_env <- getAnnEnv
-     return (findAnns deserialize ann_env target)
-
--- | Deserialize all annotations of a given type. This happens lazily, that is
---   no deserialization will take place until the [a] is actually demanded and
---   the [a] can also be empty (the UniqFM is not filtered).
-deserializeAnnotations :: Typeable a => ([Word8] -> a) -> CoreM (UniqFM [a])
-deserializeAnnotations deserialize = do
-     ann_env <- getAnnEnv
+-- See Note [Annotations]
+getAnnotations :: Typeable a => ([Word8] -> a) -> ModGuts -> CoreM (UniqFM [a])
+getAnnotations deserialize guts = do
+     hsc_env <- getHscEnv
+     ann_env <- liftIO $ prepareAnnotations hsc_env (Just guts)
      return (deserializeAnns deserialize ann_env)
 
-addAnnotation :: Typeable a => (a -> [Word8]) -> CoreAnnTarget -> a -> CoreM ()
-addAnnotation serialize target what = addAnnotationToEnv $ Annotation { ann_target = target, ann_value = toSerialized serialize what }
-
-addAnnotationToEnv :: Annotation -> CoreM ()
-addAnnotationToEnv annotation = modifyS (\state -> state { cs_ann_env = extendAnnEnvList (cs_ann_env state) [annotation] })
-
+-- | Get at most one annotation of a given type per Unique.
+getFirstAnnotations :: Typeable a => ([Word8] -> a) -> ModGuts -> CoreM (UniqFM a)
+getFirstAnnotations deserialize guts
+  = liftM (mapUFM head . filterUFM (not . null))
+  $ getAnnotations deserialize guts
+  
 \end{code}
 
+Note [Annotations]
+~~~~~~~~~~~~~~~~~~
+A Core-to-Core pass that wants to make use of annotations calls
+getAnnotations or getFirstAnnotations at the beginning to obtain a UniqFM with
+annotations of a specific type. This produces all annotations from interface
+files read so far. However, annotations from interface files read during the
+pass will not be visible until getAnnotations is called again. This is similar
+to how rules work and probably isn't too bad.
+
+The current implementation could be optimised a bit: when looking up
+annotations for a thing from the HomePackageTable, we could search directly in
+the module where the thing is defined rather than building one UniqFM which
+contains all annotations we know of. This would work because annotations can
+only be given to things defined in the same module. However, since we would
+only want to deserialise every annotation once, we would have to build a cache
+for every module in the HTP. In the end, it's probably not worth it as long as
+we aren't using annotations heavily.
 
 %************************************************************************
 %*									*
@@ -426,3 +1128,29 @@ thNameToGhcName th_name = do
     liftIO $ initTcForLookup hsc_env (lookupThName_maybe th_name)
 #endif
 \end{code}
+
+\begin{code}
+updNameCache' :: (NameCache -> (NameCache, a)) -> CoreM a
+updNameCache' upd_fn = do
+  HscEnv { hsc_NC = nc_var } <- getHscEnv
+  r  <- liftIO $ atomicModifyIORef nc_var upd_fn
+  r' <- liftIO $ readIORef nc_var
+  _  <- liftIO $ evaluate r'
+  return r
+
+-- cut-and-pasted from IfaceEnv, where it lives in the TcRn monad rather than CoreM
+lookupOrigCoreM :: Module -> OccName -> CoreM Name
+lookupOrigCoreM mod occ
+  = do 	{ mod `seq` occ `seq` return ()	
+        ; updNameCache' $ \name_cache ->
+            case lookupOrigNameCache (nsNames name_cache) mod occ of {
+	      Just name -> (name_cache, name);
+	      Nothing   ->
+              case takeUniqFromSupply (nsUniqs name_cache) of {
+              (uniq, us) ->
+                  let
+                    name      = mkExternalName uniq mod occ noSrcSpan
+                    new_cache = extendNameCache (nsNames name_cache) mod occ name
+                  in (name_cache{ nsUniqs = us, nsNames = new_cache }, name)
+    }}}
+\end{code}
\ No newline at end of file