From: Jose Pedro Magalhaes <jpm@cs.uu.nl>
Date: Fri, 20 May 2011 17:17:13 +0000 (+0200)
Subject: Merge branch 'ghc-generics' of http://darcs.haskell.org/ghc into ghc-generics
X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=6ad311b7965a7af86f3b931b134215dff76f5fbb;hp=d7fb8d371d3228774331a67db8da805b2d68f1c4

Merge branch 'ghc-generics' of darcs.haskell.org/ghc into ghc-generics
---

diff --git a/compiler/basicTypes/BasicTypes.lhs b/compiler/basicTypes/BasicTypes.lhs
index f077882..7ea66e1 100644
--- a/compiler/basicTypes/BasicTypes.lhs
+++ b/compiler/basicTypes/BasicTypes.lhs
@@ -72,13 +72,16 @@ module BasicTypes(
         inlinePragmaActivation, inlinePragmaRuleMatchInfo,
         setInlinePragmaActivation, setInlinePragmaRuleMatchInfo,
 
-	SuccessFlag(..), succeeded, failed, successIf
+	SuccessFlag(..), succeeded, failed, successIf,
+	
+	FractionalLit(..), negateFractionalLit, integralFractionalLit
    ) where
 
 import FastString
 import Outputable
 
 import Data.Data hiding (Fixity)
+import Data.Function (on)
 \end{code}
 
 %************************************************************************
@@ -862,3 +865,36 @@ isEarlyActive (ActiveBefore {}) = True
 isEarlyActive _		        = False
 \end{code}
 
+
+
+\begin{code}
+-- Used (instead of Rational) to represent exactly the floating point literal that we
+-- encountered in the user's source program. This allows us to pretty-print exactly what
+-- the user wrote, which is important e.g. for floating point numbers that can't represented
+-- as Doubles (we used to via Double for pretty-printing). See also #2245.
+data FractionalLit
+  = FL { fl_text :: String         -- How the value was written in the source
+       , fl_value :: Rational      -- Numeric value of the literal
+       }
+  deriving (Data, Typeable, Show)
+  -- The Show instance is required for the derived Lexer.x:Token instance when DEBUG is on
+
+negateFractionalLit :: FractionalLit -> FractionalLit
+negateFractionalLit (FL { fl_text = '-':text, fl_value = value }) = FL { fl_text = text, fl_value = negate value }
+negateFractionalLit (FL { fl_text = text, fl_value = value }) = FL { fl_text = '-':text, fl_value = negate value }
+
+integralFractionalLit :: Integer -> FractionalLit
+integralFractionalLit i = FL { fl_text = show i, fl_value = fromInteger i }
+
+-- Comparison operations are needed when grouping literals
+-- for compiling pattern-matching (module MatchLit)
+
+instance Eq FractionalLit where
+  (==) = (==) `on` fl_value
+
+instance Ord FractionalLit where
+  compare = compare `on` fl_value
+
+instance Outputable FractionalLit where
+  ppr = text . fl_text
+\end{code}
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index 54b4b11..a6b215b 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -11,7 +11,7 @@
 module Cmm
   ( CmmGraph, GenCmmGraph(..), CmmBlock
   , CmmStackInfo(..), CmmTopInfo(..), Cmm, CmmTop
-  , CmmReplGraph, CmmFwdRewrite, CmmBwdRewrite
+  , CmmReplGraph, GenCmmReplGraph, CmmFwdRewrite, CmmBwdRewrite
 
   , modifyGraph
   , lastNode, replaceLastNode, insertBetween
@@ -46,7 +46,8 @@ type CmmGraph = GenCmmGraph CmmNode
 data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
 type CmmBlock = Block CmmNode C C
 
-type CmmReplGraph e x = FuelUniqSM (Maybe (Graph CmmNode e x))
+type CmmReplGraph e x = GenCmmReplGraph CmmNode e x
+type GenCmmReplGraph n e x = FuelUniqSM (Maybe (Graph n e x))
 type CmmFwdRewrite f = FwdRewrite FuelUniqSM CmmNode f
 type CmmBwdRewrite f = BwdRewrite FuelUniqSM CmmNode f
 
diff --git a/compiler/cmm/CmmCPS.hs b/compiler/cmm/CmmCPS.hs
index 6e97100..35eabb3 100644
--- a/compiler/cmm/CmmCPS.hs
+++ b/compiler/cmm/CmmCPS.hs
@@ -67,49 +67,47 @@ mutable reference cells in an 'HscEnv' and are
 global to one compiler session.
 -}
 
+-- EZY: It might be helpful to have an easy way of dumping the "pre"
+-- input for any given phase, besides just turning it all on with
+-- -ddump-cmmz
+
 cpsTop :: HscEnv -> CmmTop -> IO ([(CLabel, CAFSet)], [(CAFSet, CmmTop)])
 cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)])
 cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
     do
-       -- Why bother doing it this early?
-       -- g <- dual_rewrite run Opt_D_dump_cmmz "spills and reloads"
-       --                       (dualLivenessWithInsertion callPPs) g
-       -- g <- run $ insertLateReloads g -- Duplicate reloads just before uses
-       -- g <- dual_rewrite runOptimization Opt_D_dump_cmmz "Dead Assignment Elimination"
-       --                   (removeDeadAssignmentsAndReloads callPPs) g
-       dump Opt_D_dump_cmmz "Pre common block elimination" g
-       g <- return $ elimCommonBlocks g
-       dump Opt_D_dump_cmmz "Post common block elimination" g
+       -- Why bother doing these early: dualLivenessWithInsertion,
+       -- insertLateReloads, rewriteAssignments?
 
+       ----------- Eliminate common blocks -------------------
+       g <- return $ elimCommonBlocks g
+       dump Opt_D_dump_cmmz_cbe "Post common block elimination" g
        -- Any work storing block Labels must be performed _after_ elimCommonBlocks
 
        ----------- Proc points -------------------
        let callPPs = callProcPoints g
        procPoints <- run $ minimalProcPointSet callPPs g
        g <- run $ addProcPointProtocols callPPs procPoints g
-       dump Opt_D_dump_cmmz "Post Proc Points Added" g
+       dump Opt_D_dump_cmmz_proc "Post Proc Points Added" g
 
        ----------- Spills and reloads -------------------
-       g     <- 
-              -- pprTrace "pre Spills" (ppr g) $
-                dual_rewrite run Opt_D_dump_cmmz "spills and reloads"
-                             (dualLivenessWithInsertion procPoints) g
-                    -- Insert spills at defns; reloads at return points
-       g     <-
-              -- pprTrace "pre insertLateReloads" (ppr g) $
-                runOptimization $ insertLateReloads g -- Duplicate reloads just before uses
-       dump Opt_D_dump_cmmz "Post late reloads" g
-       g     <-
-               -- pprTrace "post insertLateReloads" (ppr g) $
-                dual_rewrite runOptimization Opt_D_dump_cmmz "Dead Assignment Elimination"
-                                        (removeDeadAssignmentsAndReloads procPoints) g
-                    -- Remove redundant reloads (and any other redundant asst)
-
-       ----------- Debug only: add code to put zero in dead stack slots----
-       -- Debugging: stubbing slots on death can cause crashes early
-       g <- -- trace "post dead-assign elim" $
-            if opt_StubDeadValues then run $ stubSlotsOnDeath g else return g
+       g <- run $ dualLivenessWithInsertion procPoints g
+       dump Opt_D_dump_cmmz_spills "Post spills and reloads" g
 
+       ----------- Sink and inline assignments -------------------
+       g <- runOptimization $ rewriteAssignments g
+       dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
+
+       ----------- Eliminate dead assignments -------------------
+       -- Remove redundant reloads (and any other redundant asst)
+       g <- runOptimization $ removeDeadAssignmentsAndReloads procPoints g
+       dump Opt_D_dump_cmmz_dead "Post Dead Assignment Elimination" g
+
+       ----------- Zero dead stack slots (Debug only) ---------------
+       -- Debugging: stubbing slots on death can cause crashes early
+       g <- if opt_StubDeadValues
+                then run $ stubSlotsOnDeath g
+                else return g
+       dump Opt_D_dump_cmmz_stub "Post stub dead stack slots" g
 
        --------------- Stack layout ----------------
        slotEnv <- run $ liveSlotAnal g
@@ -120,16 +118,16 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
 
        ------------  Manifest the stack pointer --------
        g  <- run $ manifestSP spEntryMap areaMap entry_off g
-       dump Opt_D_dump_cmmz "after manifestSP" g
+       dump Opt_D_dump_cmmz_sp "Post manifestSP" g
        -- UGH... manifestSP can require updates to the procPointMap.
        -- We can probably do something quicker here for the update...
 
        ------------- Split into separate procedures ------------
        procPointMap  <- run $ procPointAnalysis procPoints g
-       dump Opt_D_dump_cmmz "procpoint map" procPointMap
+       dump Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
        gs <- run $ splitAtProcPoints l callPPs procPoints procPointMap
                                        (CmmProc h l g)
-       mapM_ (dump Opt_D_dump_cmmz "after splitting") gs
+       mapM_ (dump Opt_D_dump_cmmz_split "Post splitting") gs
 
        ------------- More CAFs and foreign calls ------------
        cafEnv <- run $ cafAnal g
@@ -137,30 +135,29 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
        mbpprTrace "localCAFs" (ppr localCAFs) $ return ()
 
        gs <- run $ mapM (lowerSafeForeignCalls areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz "after lowerSafeForeignCalls") gs
+       mapM_ (dump Opt_D_dump_cmmz_lower "Post lowerSafeForeignCalls") gs
 
        -- NO MORE GRAPH TRANSFORMATION AFTER HERE -- JUST MAKING INFOTABLES
-       let gs' = map (setInfoTableStackMap slotEnv areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz "after setInfoTableStackMap") gs'
-       let gs'' = map (bundleCAFs cafEnv) gs'
-       mapM_ (dump Opt_D_dump_cmmz "after bundleCAFs") gs''
-       return (localCAFs, gs'')
+       gs <- return $ map (setInfoTableStackMap slotEnv areaMap) gs
+       mapM_ (dump Opt_D_dump_cmmz_info "after setInfoTableStackMap") gs
+       gs <- return $ map (bundleCAFs cafEnv) gs
+       mapM_ (dump Opt_D_dump_cmmz_cafs "after bundleCAFs") gs
+       return (localCAFs, gs)
   where dflags = hsc_dflags hsc_env
         mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z
-        dump f txt g = dumpIfSet_dyn dflags f txt (ppr g)
+        dump f txt g = do
+            -- ToDo: No easy way of say "dump all the cmmz, *and* split
+            -- them into files."  Also, -ddump-cmmz doesn't play nicely
+            -- with -ddump-to-file, since the headers get omitted.
+            dumpIfSet_dyn dflags f txt (ppr g)
+            when (not (dopt f dflags)) $
+                dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (ppr g)
         -- Runs a required transformation/analysis
         run = runInfiniteFuelIO (hsc_OptFuel hsc_env)
         -- Runs an optional transformation/analysis (and should
         -- thus be subject to optimization fuel)
         runOptimization = runFuelIO (hsc_OptFuel hsc_env)
 
-        -- pass 'run' or 'runOptimization' for 'r'
-        dual_rewrite r flag txt pass g =
-          do dump flag ("Pre " ++ txt)  g
-             g <- r $ pass g
-             dump flag ("Post " ++ txt) $ g
-             return g
-
 -- This probably belongs in CmmBuildInfoTables?
 -- We're just finishing the job here: once we know what CAFs are defined
 -- in non-static closures, we can build the SRTs.
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index ee948fe..7d50d9a 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -10,7 +10,7 @@
 module CmmNode
   ( CmmNode(..)
   , UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..)
-  , mapExp, mapExpDeep, foldExp, foldExpDeep
+  , mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf
   )
 where
 
@@ -46,7 +46,9 @@ data CmmNode e x where
       CmmActuals ->               -- zero or more arguments
       CmmNode O O
       -- Semantics: kills only result regs; all other regs (both GlobalReg
-      --            and LocalReg) are preserved
+      --            and LocalReg) are preserved.  But there is a current
+      --            bug for what can be put in arguments, see
+      --            Note [Register Parameter Passing]
 
   CmmBranch :: Label -> CmmNode O C  -- Goto another block in the same procedure
 
@@ -73,7 +75,8 @@ data CmmNode e x where
 -- moment of the call.  Later stages can use this to give liveness
 -- everywhere, which in turn guides register allocation.
 -- It is the companion of cml_args; cml_args says which stack words
--- hold parameters, while cml_arg_regs says which global regs hold parameters
+-- hold parameters, while cml_arg_regs says which global regs hold parameters.
+-- But do note [Register parameter passing]
 
       cml_args :: ByteOff,
           -- Byte offset, from the *old* end of the Area associated with
@@ -103,7 +106,7 @@ data CmmNode e x where
   		    		-- Always the last node of a block
       tgt   :: ForeignTarget,   -- call target and convention
       res   :: CmmFormals,      -- zero or more results
-      args  :: CmmActuals,      -- zero or more arguments
+      args  :: CmmActuals,      -- zero or more arguments; see Note [Register parameter passing]
       succ  :: Label,           -- Label of continuation
       updfr :: UpdFrameOffset,  -- where the update frame is (for building infotable)
       intrbl:: Bool             -- whether or not the call is interruptible
@@ -113,9 +116,11 @@ data CmmNode e x where
 ~~~~~~~~~~~~~~~~~~~~~~~
 A CmmUnsafeForeignCall is used for *unsafe* foreign calls;
 a CmmForeignCall call is used for *safe* foreign calls.
-Unsafe ones are easy: think of them as a "fat machine instruction".
-In particular, they do *not* kill all live registers (there was a bit
-of code in GHC that conservatively assumed otherwise.)
+
+Unsafe ones are mostly easy: think of them as a "fat machine
+instruction".  In particular, they do *not* kill all live registers,
+just the registers they return to (there was a bit of code in GHC that
+conservatively assumed otherwise.)  However, see [Register parameter passing].
 
 Safe ones are trickier.  A safe foreign call 
      r = f(x)
@@ -138,6 +143,21 @@ constructors do *not* (currently) know the foreign call conventions.
 Note that a safe foreign call needs an info table.
 -}
 
+{- Note [Register parameter passing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+On certain architectures, some registers are utilized for parameter
+passing in the C calling convention.  For example, in x86-64 Linux
+convention, rdi, rsi, rdx and rcx (as well as r8 and r9) may be used for
+argument passing.  These are registers R3-R6, which our generated
+code may also be using; as a result, it's necessary to save these
+values before doing a foreign call.  This is done during initial
+code generation in callerSaveVolatileRegs in StgCmmUtils.hs.  However,
+one result of doing this is that the contents of these registers
+may mysteriously change if referenced inside the arguments.  This
+is dangerous, so you'll need to disable inlining much in the same
+way is done in cmm/CmmOpt.hs currently.  We should fix this!
+-}
+
 ---------------------------------------------
 -- Eq instance of CmmNode
 -- It is a shame GHC cannot infer it by itself :(
diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs
index 17364ad..2dcfb02 100644
--- a/compiler/cmm/CmmSpillReload.hs
+++ b/compiler/cmm/CmmSpillReload.hs
@@ -1,7 +1,8 @@
-{-# LANGUAGE GADTs,NoMonoLocalBinds #-}
+{-# LANGUAGE GADTs, NoMonoLocalBinds, FlexibleContexts, ViewPatterns #-}
 -- Norman likes local bindings
 -- If this module lives on I'd like to get rid of this flag in due course
 
+{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
 {-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
 #if __GLASGOW_HASKELL__ >= 701
 -- GHC 7.0.1 improved incomplete pattern warnings with GADTs
@@ -14,9 +15,7 @@ module CmmSpillReload
   --, insertSpillsAndReloads  --- XXX todo check live-in at entry against formals
   , dualLivenessWithInsertion
 
-  , availRegsLattice
-  , cmmAvailableReloads
-  , insertLateReloads
+  , rewriteAssignments
   , removeDeadAssignmentsAndReloads
   )
 where
@@ -26,13 +25,16 @@ import Cmm
 import CmmExpr
 import CmmLive
 import OptimizationFuel
+import StgCmmUtils
 
 import Control.Monad
 import Outputable hiding (empty)
 import qualified Outputable as PP
 import UniqSet
+import UniqFM
+import Unique
 
-import Compiler.Hoopl
+import Compiler.Hoopl hiding (Unique)
 import Data.Maybe
 import Prelude hiding (succ, zip)
 
@@ -172,11 +174,6 @@ insertSpillAndReloadRewrites graph procPoints = deepBwdRw3 first middle nothing
                                                text "after"{-, ppr m-}]) $
                    Just $ mkMiddles $ [m, spill reg]
               else Nothing
-          middle m@(CmmUnsafeForeignCall _ fs _) live = return $
-            case map spill  (filter (flip elemRegSet (on_stack live)) fs) ++
-                 map reload (uniqSetToList (kill fs (in_regs live))) of
-              []      -> Nothing
-              reloads -> Just $ mkMiddles (m : reloads)
           middle _ _ = return Nothing
 
           nothing _ _ = return Nothing
@@ -188,91 +185,6 @@ spill, reload :: LocalReg -> CmmNode O O
 spill  r = CmmStore  (regSlot r) (CmmReg $ CmmLocal r)
 reload r = CmmAssign (CmmLocal r) (CmmLoad (regSlot r) $ localRegType r)
 
-----------------------------------------------------------------
---- sinking reloads
-
--- The idea is to compute at each point the set of registers such that
--- on every path to the point, the register is defined by a Reload
--- instruction.  Then, if a use appears at such a point, we can safely
--- insert a Reload right before the use.  Finally, we can eliminate
--- the early reloads along with other dead assignments.
-
-data AvailRegs = UniverseMinus RegSet
-               | AvailRegs     RegSet
-
-
-availRegsLattice :: DataflowLattice AvailRegs
-availRegsLattice = DataflowLattice "register gotten from reloads" empty add
-    where empty = UniverseMinus emptyRegSet
-          -- | compute in the Tx monad to track whether anything has changed
-          add _ (OldFact old) (NewFact new) =
-            if join `smallerAvail` old then (SomeChange, join) else (NoChange, old)
-            where join = interAvail new old
-
-
-interAvail :: AvailRegs -> AvailRegs -> AvailRegs
-interAvail (UniverseMinus s) (UniverseMinus s') = UniverseMinus (s `plusRegSet`  s')
-interAvail (AvailRegs     s) (AvailRegs     s') = AvailRegs (s `timesRegSet` s')
-interAvail (AvailRegs     s) (UniverseMinus s') = AvailRegs (s  `minusRegSet` s')
-interAvail (UniverseMinus s) (AvailRegs     s') = AvailRegs (s' `minusRegSet` s )
-
-smallerAvail :: AvailRegs -> AvailRegs -> Bool
-smallerAvail (AvailRegs     _) (UniverseMinus _)  = True
-smallerAvail (UniverseMinus _) (AvailRegs     _)  = False
-smallerAvail (AvailRegs     s) (AvailRegs    s')  = sizeUniqSet s < sizeUniqSet s'
-smallerAvail (UniverseMinus s) (UniverseMinus s') = sizeUniqSet s > sizeUniqSet s'
-
-extendAvail :: AvailRegs -> LocalReg -> AvailRegs
-extendAvail (UniverseMinus s) r = UniverseMinus (deleteFromRegSet s r)
-extendAvail (AvailRegs     s) r = AvailRegs (extendRegSet s r)
-
-delFromAvail :: AvailRegs -> LocalReg -> AvailRegs
-delFromAvail (UniverseMinus s) r = UniverseMinus (extendRegSet s r)
-delFromAvail (AvailRegs     s) r = AvailRegs (deleteFromRegSet s r)
-
-elemAvail :: AvailRegs -> LocalReg -> Bool
-elemAvail (UniverseMinus s) r = not $ elemRegSet r s
-elemAvail (AvailRegs     s) r = elemRegSet r s
-
-cmmAvailableReloads :: CmmGraph -> FuelUniqSM (BlockEnv AvailRegs)
-cmmAvailableReloads g =
-  liftM snd $ dataflowPassFwd g [(g_entry g, fact_bot availRegsLattice)] $
-                              analFwd availRegsLattice availReloadsTransfer
-
-availReloadsTransfer :: FwdTransfer CmmNode AvailRegs
-availReloadsTransfer = mkFTransfer3 (flip const) middleAvail ((mkFactBase availRegsLattice .) . lastAvail)
-
-middleAvail :: CmmNode O O -> AvailRegs -> AvailRegs
-middleAvail (CmmAssign (CmmLocal r) (CmmLoad l _)) avail
-               | l `isStackSlotOf` r = extendAvail avail r
-middleAvail (CmmAssign lhs _)        avail = foldRegsDefd delFromAvail avail lhs
-middleAvail (CmmStore l (CmmReg (CmmLocal r))) avail
-               | l `isStackSlotOf` r = avail
-middleAvail (CmmStore (CmmStackSlot (RegSlot r) _) _) avail = delFromAvail avail r
-middleAvail (CmmStore {})            avail = avail
-middleAvail (CmmUnsafeForeignCall {}) _    = AvailRegs emptyRegSet
-middleAvail (CmmComment {})          avail = avail
-
-lastAvail :: CmmNode O C -> AvailRegs -> [(Label, AvailRegs)]
-lastAvail (CmmCall _ (Just k) _ _ _) _ = [(k, AvailRegs emptyRegSet)]
-lastAvail (CmmForeignCall {succ=k})  _ = [(k, AvailRegs emptyRegSet)]
-lastAvail l avail = map (\id -> (id, avail)) $ successors l
-
-insertLateReloads :: CmmGraph -> FuelUniqSM CmmGraph
-insertLateReloads g =
-  liftM fst $ dataflowPassFwd g [(g_entry g, fact_bot availRegsLattice)] $
-                              analRewFwd availRegsLattice availReloadsTransfer rewrites
-  where rewrites = mkFRewrite3 first middle last
-        first _ _ = return Nothing
-        middle m avail = return $ maybe_reload_before avail m (mkMiddle m)
-        last   l avail = return $ maybe_reload_before avail l (mkLast l)
-        maybe_reload_before avail node tail =
-            let used = filterRegsUsed (elemAvail avail) node
-            in  if isEmptyUniqSet used then Nothing
-                                       else Just $ reloadTail used tail
-        reloadTail regset t = foldl rel t $ uniqSetToList regset
-          where rel t r = mkMiddle (reload r) <*> t
-
 removeDeadAssignmentsAndReloads :: BlockSet -> CmmGraph -> FuelUniqSM CmmGraph
 removeDeadAssignmentsAndReloads procPoints g =
    liftM fst $ dataflowPassBwd g [] $ analRewBwd dualLiveLattice
@@ -283,10 +195,464 @@ removeDeadAssignmentsAndReloads procPoints g =
          -- but GHC panics while compiling, see bug #4045.
          middle :: CmmNode O O -> Fact O DualLive -> CmmReplGraph O O
          middle (CmmAssign (CmmLocal reg') _) live | not (reg' `elemRegSet` in_regs live) = return $ Just emptyGraph
+         -- XXX maybe this should be somewhere else...
+         middle (CmmAssign lhs (CmmReg rhs))   _ | lhs == rhs = return $ Just emptyGraph
+         middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs = return $ Just emptyGraph
          middle _ _ = return Nothing
 
          nothing _ _ = return Nothing
 
+----------------------------------------------------------------
+--- Usage information
+
+-- We decorate all register assignments with usage information,
+-- that is, the maximum number of times the register is referenced
+-- while it is live along all outgoing control paths.  There are a few
+-- subtleties here:
+--
+--  - If a register goes dead, and then becomes live again, the usages
+--    of the disjoint live range don't count towards the original range.
+--
+--          a = 1; // used once
+--          b = a;
+--          a = 2; // used once
+--          c = a;
+--
+--  - A register may be used multiple times, but these all reside in
+--    different control paths, such that any given execution only uses
+--    it once. In that case, the usage count may still be 1.
+--
+--          a = 1; // used once
+--          if (b) {
+--              c = a + 3;
+--          } else {
+--              c = a + 1;
+--          }
+--
+--    This policy corresponds to an inlining strategy that does not
+--    duplicate computation but may increase binary size.
+--
+--  - If we naively implement a usage count, we have a counting to
+--    infinity problem across joins.  Furthermore, knowing that
+--    something is used 2 or more times in one runtime execution isn't
+--    particularly useful for optimizations (inlining may be beneficial,
+--    but there's no way of knowing that without register pressure
+--    information.)
+--
+--          while (...) {
+--              // first iteration, b used once
+--              // second iteration, b used twice
+--              // third iteration ...
+--              a = b;
+--          }
+--          // b used zero times
+--
+--    There is an orthogonal question, which is that for every runtime
+--    execution, the register may be used only once, but if we inline it
+--    in every conditional path, the binary size might increase a lot.
+--    But tracking this information would be tricky, because it violates
+--    the finite lattice restriction Hoopl requires for termination;
+--    we'd thus need to supply an alternate proof, which is probably
+--    something we should defer until we actually have an optimization
+--    that would take advantage of this.  (This might also interact
+--    strangely with liveness information.)
+--
+--          a = ...;
+--          // a is used one time, but in X different paths
+--          case (b) of
+--              1 -> ... a ...
+--              2 -> ... a ...
+--              3 -> ... a ...
+--              ...
+--
+--  This analysis is very similar to liveness analysis; we just keep a
+--  little extra info. (Maybe we should move it to CmmLive, and subsume
+--  the old liveness analysis.)
+
+data RegUsage = SingleUse | ManyUse
+    deriving (Ord, Eq, Show)
+-- Absence in map = ZeroUse
+
+{-
+-- minBound is bottom, maxBound is top, least-upper-bound is max
+-- ToDo: Put this in Hoopl.  Note that this isn't as useful as I
+-- originally hoped, because you usually want to leave out the bottom
+-- element when you have things like this put in maps.  Maybe f is
+-- useful on its own as a combining function.
+boundedOrdLattice :: (Bounded a, Ord a) => String -> DataflowLattice a
+boundedOrdLattice n = DataflowLattice n minBound f
+    where f _ (OldFact x) (NewFact y)
+            | x >= y    = (NoChange,   x)
+            | otherwise = (SomeChange, y)
+-}
+
+-- Custom node type we'll rewrite to.  CmmAssign nodes to local
+-- registers are replaced with AssignLocal nodes.
+data WithRegUsage n e x where
+    Plain       :: n e x -> WithRegUsage n e x
+    AssignLocal :: LocalReg -> CmmExpr -> RegUsage -> WithRegUsage n O O
+
+instance UserOfLocalRegs (n e x) => UserOfLocalRegs (WithRegUsage n e x) where
+    foldRegsUsed f z (Plain n) = foldRegsUsed f z n
+    foldRegsUsed f z (AssignLocal _ e _) = foldRegsUsed f z e
+
+instance DefinerOfLocalRegs (n e x) => DefinerOfLocalRegs (WithRegUsage n e x) where
+    foldRegsDefd f z (Plain n) = foldRegsDefd f z n
+    foldRegsDefd f z (AssignLocal r _ _) = foldRegsDefd f z r
+
+instance NonLocal n => NonLocal (WithRegUsage n) where
+    entryLabel (Plain n) = entryLabel n
+    successors (Plain n) = successors n
+
+liftRegUsage :: Graph n e x -> Graph (WithRegUsage n) e x
+liftRegUsage = mapGraph Plain
+
+eraseRegUsage :: Graph (WithRegUsage CmmNode) e x -> Graph CmmNode e x
+eraseRegUsage = mapGraph f
+    where f :: WithRegUsage CmmNode e x -> CmmNode e x
+          f (AssignLocal l e _) = CmmAssign (CmmLocal l) e
+          f (Plain n) = n
+
+type UsageMap = UniqFM RegUsage
+
+usageLattice :: DataflowLattice UsageMap
+usageLattice = DataflowLattice "usage counts for registers" emptyUFM (joinUFM f)
+    where f _ (OldFact x) (NewFact y)
+            | x >= y    = (NoChange,   x)
+            | otherwise = (SomeChange, y)
+
+-- We reuse the names 'gen' and 'kill', although we're doing something
+-- slightly different from the Dragon Book
+usageTransfer :: BwdTransfer (WithRegUsage CmmNode) UsageMap
+usageTransfer = mkBTransfer3 first middle last
+    where first _ f = f
+          middle :: WithRegUsage CmmNode O O -> UsageMap -> UsageMap
+          middle n f = gen_kill n f
+          last :: WithRegUsage CmmNode O C -> FactBase UsageMap -> UsageMap
+          -- Checking for CmmCall/CmmForeignCall is unnecessary, because
+          -- spills/reloads have already occurred by the time we do this
+          -- analysis.
+          -- XXX Deprecated warning is puzzling: what label are we
+          -- supposed to use?
+          -- ToDo: With a bit more cleverness here, we can avoid
+          -- disappointment and heartbreak associated with the inability
+          -- to inline into CmmCall and CmmForeignCall by
+          -- over-estimating the usage to be ManyUse.
+          last n f = gen_kill n (joinOutFacts usageLattice n f)
+          gen_kill a = gen a . kill a
+          gen  a f = foldRegsUsed increaseUsage f a
+          kill a f = foldRegsDefd delFromUFM f a
+          increaseUsage f r = addToUFM_C combine f r SingleUse
+            where combine _ _ = ManyUse
+
+usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap
+usageRewrite = mkBRewrite3 first middle last
+    where first  _ _ = return Nothing
+          middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
+          middle (Plain (CmmAssign (CmmLocal l) e)) f
+                     = return . Just
+                     $ case lookupUFM f l of
+                            Nothing    -> emptyGraph
+                            Just usage -> mkMiddle (AssignLocal l e usage)
+          middle _ _ = return Nothing
+          last   _ _ = return Nothing
+
+type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode)
+annotateUsage :: CmmGraph -> FuelUniqSM (CmmGraphWithRegUsage)
+annotateUsage vanilla_g =
+    let g = modifyGraph liftRegUsage vanilla_g
+    in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
+                                   analRewBwd usageLattice usageTransfer usageRewrite
+
+----------------------------------------------------------------
+--- Assignment tracking
+
+-- The idea is to maintain a map of local registers do expressions,
+-- such that the value of that register is the same as the value of that
+-- expression at any given time.  We can then do several things,
+-- as described by Assignment.
+
+-- Assignment describes the various optimizations that are valid
+-- at a given point in the program.
+data Assignment =
+-- This assignment can always be inlined.  It is cheap or single-use.
+                  AlwaysInline CmmExpr
+-- This assignment should be sunk down to its first use.  (This will
+-- increase code size if the register is used in multiple control flow
+-- paths, but won't increase execution time, and the reduction of
+-- register pressure is worth it.)
+                | AlwaysSink CmmExpr
+-- We cannot safely optimize occurrences of this local register. (This
+-- corresponds to top in the lattice structure.)
+                | NeverOptimize
+
+-- Extract the expression that is being assigned to
+xassign :: Assignment -> Maybe CmmExpr
+xassign (AlwaysInline e) = Just e
+xassign (AlwaysSink e)   = Just e
+xassign NeverOptimize    = Nothing
+
+-- Extracts the expression, but only if they're the same constructor
+xassign2 :: (Assignment, Assignment) -> Maybe (CmmExpr, CmmExpr)
+xassign2 (AlwaysInline e, AlwaysInline e') = Just (e, e')
+xassign2 (AlwaysSink e, AlwaysSink e')     = Just (e, e')
+xassign2 _ = Nothing
+
+-- Note: We'd like to make decisions about "not optimizing" as soon as
+-- possible, because this will make running the transfer function more
+-- efficient.
+type AssignmentMap = UniqFM Assignment
+
+assignmentLattice :: DataflowLattice AssignmentMap
+assignmentLattice = DataflowLattice "assignments for registers" emptyUFM (joinUFM add)
+    where add _ (OldFact old) (NewFact new)
+            = case (old, new) of
+                (NeverOptimize, _) -> (NoChange,   NeverOptimize)
+                (_, NeverOptimize) -> (SomeChange, NeverOptimize)
+                (xassign2 -> Just (e, e'))
+                    | e == e'   -> (NoChange, old)
+                    | otherwise -> (SomeChange, NeverOptimize)
+                _ -> (SomeChange, NeverOptimize)
+
+-- Deletes sinks from assignment map, because /this/ is the place
+-- where it will be sunk to.
+deleteSinks :: UserOfLocalRegs n => n -> AssignmentMap -> AssignmentMap
+deleteSinks n m = foldRegsUsed (adjustUFM f) m n
+  where f (AlwaysSink _) = NeverOptimize
+        f old = old
+
+-- Invalidates any expressions that use a register.
+invalidateUsersOf :: CmmReg -> AssignmentMap -> AssignmentMap
+-- foldUFM_Directly :: (Unique -> elt -> a -> a) -> a -> UniqFM elt -> a
+invalidateUsersOf reg m = foldUFM_Directly f m m -- [foldUFM performance]
+    where f u (xassign -> Just e) m | reg `regUsedIn` e = addToUFM_Directly m u NeverOptimize
+          f _ _ m = m
+{- This requires the entire spine of the map to be continually rebuilt,
+ - which causes crazy memory usage!
+invalidateUsersOf reg = mapUFM (invalidateUsers' reg)
+  where invalidateUsers' reg (xassign -> Just e) | reg `regUsedIn` e = NeverOptimize
+        invalidateUsers' _ old = old
+-}
+
+-- Note [foldUFM performance]
+-- These calls to fold UFM no longer leak memory, but they do cause
+-- pretty killer amounts of allocation.  So they'll be something to
+-- optimize; we need an algorithmic change to prevent us from having to
+-- traverse the /entire/ map continually.
+
+middleAssignment :: WithRegUsage CmmNode O O -> AssignmentMap -> AssignmentMap
+
+-- Algorithm for annotated assignments:
+--  1. Delete any sinking assignments that were used by this instruction
+--  2. Add the assignment to our list of valid local assignments with
+--     the correct optimization policy.
+--  3. Look for all assignments that reference that register and
+--     invalidate them.
+middleAssignment n@(AssignLocal r e usage) assign
+    = invalidateUsersOf (CmmLocal r) . add . deleteSinks n $ assign
+      where add m = addToUFM m r
+                  $ case usage of
+                        SingleUse -> AlwaysInline e
+                        ManyUse   -> decide e
+            decide CmmLit{}       = AlwaysInline e
+            decide CmmReg{}       = AlwaysInline e
+            decide CmmLoad{}      = AlwaysSink e
+            decide CmmStackSlot{} = AlwaysSink e
+            decide CmmMachOp{}    = AlwaysSink e
+            -- We'll always inline simple operations on the global
+            -- registers, to reduce register pressure: Sp - 4 or Hp - 8
+            -- EZY: Justify this optimization more carefully.
+            decide CmmRegOff{}    = AlwaysInline e
+
+-- Algorithm for unannotated assignments of global registers:
+-- 1. Delete any sinking assignments that were used by this instruction
+-- 2. Look for all assignments that reference this register and
+--    invalidate them.
+middleAssignment (Plain n@(CmmAssign reg@(CmmGlobal _) _)) assign
+    = invalidateUsersOf reg . deleteSinks n $ assign
+
+-- Algorithm for unannotated assignments of *local* registers: do
+-- nothing (it's a reload, so no state should have changed)
+middleAssignment (Plain (CmmAssign (CmmLocal _) _)) assign = assign
+
+-- Algorithm for stores:
+--  1. Delete any sinking assignments that were used by this instruction
+--  2. Look for all assignments that load from memory locations that
+--     were clobbered by this store and invalidate them.
+middleAssignment (Plain n@(CmmStore lhs rhs)) assign
+    = let m = deleteSinks n assign
+      in foldUFM_Directly f m m -- [foldUFM performance]
+      where f u (xassign -> Just x) m | (lhs, rhs) `clobbers` (u, x) = addToUFM_Directly m u NeverOptimize
+            f _ _ m = m
+{- Also leaky
+    = mapUFM_Directly p . deleteSinks n $ assign
+      -- ToDo: There's a missed opportunity here: even if a memory
+      -- access we're attempting to sink gets clobbered at some
+      -- location, it's still /better/ to sink it to right before the
+      -- point where it gets clobbered.  How might we do this?
+      -- Unfortunately, it's too late to change the assignment...
+      where p r (xassign -> Just x) | (lhs, rhs) `clobbers` (r, x) = NeverOptimize
+            p _ old = old
+-}
+
+-- Assumption: Unsafe foreign calls don't clobber memory
+-- Since foreign calls clobber caller saved registers, we need
+-- invalidate any assignments that reference those global registers.
+-- This is kind of expensive. (One way to optimize this might be to
+-- store extra information about expressions that allow this and other
+-- checks to be done cheaply.)
+middleAssignment (Plain n@(CmmUnsafeForeignCall{})) assign
+    = deleteCallerSaves (foldRegsDefd (\m r -> addToUFM m r NeverOptimize) (deleteSinks n assign) n)
+    where deleteCallerSaves m = foldUFM_Directly f m m
+          f u (xassign -> Just x) m | wrapRecExpf g x False = addToUFM_Directly m u NeverOptimize
+          f _ _ m = m
+          g (CmmReg (CmmGlobal r)) _      | callerSaves r = True
+          g (CmmRegOff (CmmGlobal r) _) _ | callerSaves r = True
+          g _ b = b
+
+middleAssignment (Plain (CmmComment {})) assign
+    = assign
+
+-- Assumptions:
+--  * Writes using Hp do not overlap with any other memory locations
+--    (An important invariant being relied on here is that we only ever
+--    use Hp to allocate values on the heap, which appears to be the
+--    case given hpReg usage, and that our heap writing code doesn't
+--    do anything stupid like overlapping writes.)
+--  * Stack slots do not overlap with any other memory locations
+--  * Stack slots for different areas do not overlap
+--  * Stack slots within the same area and different offsets may
+--    overlap; we need to do a size check (see 'overlaps').
+--  * Register slots only overlap with themselves.  (But this shouldn't
+--    happen in practice, because we'll fail to inline a reload across
+--    the next spill.)
+--  * Non stack-slot stores always conflict with each other.  (This is
+--    not always the case; we could probably do something special for Hp)
+clobbers :: (CmmExpr, CmmExpr) -- (lhs, rhs) of clobbering CmmStore
+         -> (Unique,  CmmExpr) -- (register, expression) that may be clobbered
+         -> Bool
+clobbers (CmmRegOff (CmmGlobal Hp) _, _) (_, _) = False
+clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
+-- ToDo: Also catch MachOp case
+clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
+    | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
+clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
+    where f (CmmLoad (CmmStackSlot (CallArea a') o') t)
+            = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
+          f (CmmLoad e _)    = containsStackSlot e
+          f (CmmMachOp _ es) = or (map f es)
+          f _                = False
+          -- Maybe there's an invariant broken if this actually ever
+          -- returns True
+          containsStackSlot (CmmLoad{})      = True -- load of a load, all bets off
+          containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
+          containsStackSlot (CmmStackSlot{}) = True
+          containsStackSlot _ = False
+clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr
+    where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l'
+          f _ = False
+clobbers _ (_, e) = f e
+    where f (CmmLoad (CmmStackSlot _ _) _) = False
+          f (CmmLoad{}) = True -- conservative
+          f (CmmMachOp _ es) = or (map f es)
+          f _ = False
+
+-- Check for memory overlapping.
+-- Diagram:
+--      4      8     12
+--      s -w-  o
+--      [ I32  ]
+--      [    F64     ]
+--      s'   -w'-    o'
+type CallSubArea = (AreaId, Int, Int) -- area, offset, width
+overlaps :: CallSubArea -> CallSubArea -> Bool
+overlaps (a, _, _) (a', _, _) | a /= a' = False
+overlaps (_, o, w) (_, o', w') =
+    let s  = o  - w
+        s' = o' - w'
+    in (s' < o) && (s < o) -- Not LTE, because [ I32  ][ I32  ] is OK
+
+lastAssignment :: WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
+-- Variables are dead across calls, so invalidating all mappings is justified
+lastAssignment (Plain (CmmCall _ (Just k) _ _ _)) assign = [(k, mapUFM (const NeverOptimize) assign)]
+lastAssignment (Plain (CmmForeignCall {succ=k}))  assign = [(k, mapUFM (const NeverOptimize) assign)]
+lastAssignment l assign = map (\id -> (id, deleteSinks l assign)) $ successors l
+
+assignmentTransfer :: FwdTransfer (WithRegUsage CmmNode) AssignmentMap
+assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase assignmentLattice .) . lastAssignment)
+
+assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap
+assignmentRewrite = mkFRewrite3 first middle last
+    where
+        first _ _ = return Nothing
+        middle :: WithRegUsage CmmNode O O -> AssignmentMap -> GenCmmReplGraph (WithRegUsage CmmNode) O O
+        middle (Plain m) assign = return $ rewrite assign (precompute assign m) mkMiddle m
+        middle (AssignLocal l e u) assign = return $ rewriteLocal assign (precompute assign (CmmAssign (CmmLocal l) e)) mkMiddle l e u
+        last (Plain l) assign = return $ rewrite assign (precompute assign l) mkLast l
+        -- Tuple is (inline?, reloads)
+        precompute assign n = foldRegsUsed f (False, []) n -- duplicates are harmless
+            where f (i, l) r = case lookupUFM assign r of
+                                Just (AlwaysSink e)   -> (i, (Plain (CmmAssign (CmmLocal r) e)):l)
+                                Just (AlwaysInline _) -> (True, l)
+                                Just NeverOptimize    -> (i, l)
+                                -- This case can show up when we have
+                                -- limited optimization fuel.
+                                Nothing -> (i, l)
+        rewrite _ (False, []) _ _ = Nothing
+        -- Note [CmmCall Inline Hack]
+        -- Conservative hack: don't do any inlining on what will
+        -- be translated into an OldCmm CmmCalls, since the code
+        -- produced here tends to be unproblematic and I need to write
+        -- lint passes to ensure that we don't put anything in the
+        -- arguments that could be construed as a global register by
+        -- some later translation pass.  (For example, slots will turn
+        -- into dereferences of Sp).  See [Register parameter passing].
+        -- ToDo: Fix this up to only bug out if all inlines were for
+        -- CmmExprs with global registers (we can't use the
+        -- straightforward mapExpDeep call, in this case.) ToDo: We miss
+        -- an opportunity here, where all possible inlinings should
+        -- instead be sunk.
+        rewrite _ (True, []) _ n | not (inlinable n) = Nothing -- see [CmmCall Inline Hack]
+        rewrite assign (i, xs) mk n = Just $ mkMiddles xs <*> mk (Plain (inline i assign n))
+
+        rewriteLocal _ (False, []) _ _ _ _ = Nothing
+        rewriteLocal assign (i, xs) mk l e u = Just $ mkMiddles xs <*> mk n'
+            where n' = AssignLocal l e' u
+                  e' = if i then wrapRecExp (inlineExp assign) e else e
+            -- inlinable check omitted, since we can always inline into
+            -- assignments.
+
+        inline :: Bool -> AssignmentMap -> CmmNode e x -> CmmNode e x
+        inline False _ n = n
+        inline True  _ n | not (inlinable n) = n -- see [CmmCall Inline Hack]
+        inline True assign n = mapExpDeep (inlineExp assign) n
+
+        inlineExp assign old@(CmmReg (CmmLocal r))
+          = case lookupUFM assign r of
+              Just (AlwaysInline x) -> x
+              _ -> old
+        inlineExp assign old@(CmmRegOff (CmmLocal r) i)
+          = case lookupUFM assign r of
+              Just (AlwaysInline x) ->
+                case x of
+                    (CmmRegOff r' i') -> CmmRegOff r' (i + i')
+                    _ -> CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
+                          where rep = typeWidth (localRegType r)
+              _ -> old
+        inlineExp _ old = old
+
+        inlinable :: CmmNode e x -> Bool
+        inlinable (CmmCall{}) = False
+        inlinable (CmmForeignCall{}) = False
+        inlinable (CmmUnsafeForeignCall{}) = False
+        inlinable _ = True
+
+rewriteAssignments :: CmmGraph -> FuelUniqSM CmmGraph
+rewriteAssignments g = do
+  g'  <- annotateUsage g
+  g'' <- liftM fst $ dataflowPassFwd g' [(g_entry g, fact_bot assignmentLattice)] $
+                                     analRewFwd assignmentLattice assignmentTransfer assignmentRewrite
+  return (modifyGraph eraseRegUsage g'')
 
 ---------------------
 -- prettyprinting
@@ -305,11 +671,7 @@ instance Outputable DualLive where
                          if isEmptyUniqSet stack then PP.empty
                          else (ppr_regs "live on stack =" stack)]
 
-instance Outputable AvailRegs where
-  ppr (UniverseMinus s) = if isEmptyUniqSet s then text "<everything available>"
-                          else ppr_regs "available = all but" s
-  ppr (AvailRegs     s) = if isEmptyUniqSet s then text "<nothing available>"
-                          else ppr_regs "available = " s
+-- ToDo: Outputable instance for UsageMap and AssignmentMap
 
 my_trace :: String -> SDoc -> a -> a
 my_trace = if False then pprTrace else \_ _ a -> a
diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs
index 57d458c..f5c0817 100644
--- a/compiler/cmm/OldCmm.hs
+++ b/compiler/cmm/OldCmm.hs
@@ -144,12 +144,14 @@ data CmmStmt	-- Old-style
   | CmmStore CmmExpr CmmExpr     -- Assign to memory location.  Size is
                                  -- given by cmmExprType of the rhs.
 
-  | CmmCall	 		 -- A call (forign, native or primitive), with 
+  | CmmCall	 		 -- A call (foreign, native or primitive), with 
      CmmCallTarget
      HintedCmmFormals		 -- zero or more results
      HintedCmmActuals		 -- zero or more arguments
      CmmSafety			 -- whether to build a continuation
      CmmReturnInfo
+  -- Some care is necessary when handling the arguments of these, see
+  -- [Register parameter passing] and the hack in cmm/CmmOpt.hs
 
   | CmmBranch BlockId             -- branch to another BB in this fn
 
diff --git a/compiler/codeGen/StgCmmUtils.hs b/compiler/codeGen/StgCmmUtils.hs
index 48416e3..d917811 100644
--- a/compiler/codeGen/StgCmmUtils.hs
+++ b/compiler/codeGen/StgCmmUtils.hs
@@ -340,6 +340,23 @@ emitRtsCall' res pkg fun args _vols safe
 --  * Regs.h claims that BaseReg should be saved last and loaded first
 --    * This might not have been tickled before since BaseReg is callee save
 --  * Regs.h saves SparkHd, ParkT1, SparkBase and SparkLim
+--
+-- This code isn't actually used right now, because callerSaves
+-- only ever returns true in the current universe for registers NOT in
+-- system_regs (just do a grep for CALLER_SAVES in
+-- includes/stg/MachRegs.h).  It's all one giant no-op, and for
+-- good reason: having to save system registers on every foreign call
+-- would be very expensive, so we avoid assigning them to those
+-- registers when we add support for an architecture.
+--
+-- Note that the old code generator actually does more work here: it
+-- also saves other global registers.  We can't (nor want) to do that
+-- here, as we don't have liveness information.  And really, we
+-- shouldn't be doing the workaround at this point in the pipeline, see
+-- Note [Register parameter passing] and the ToDo on CmmCall in
+-- cmm/CmmNode.hs.  Right now the workaround is to avoid inlining across
+-- unsafe foreign calls in rewriteAssignments, but this is strictly
+-- temporary.
 callerSaveVolatileRegs :: (CmmAGraph, CmmAGraph)
 callerSaveVolatileRegs = (caller_save, caller_load)
   where
@@ -396,6 +413,51 @@ callerSaves :: GlobalReg -> Bool
 #ifdef CALLER_SAVES_Base
 callerSaves BaseReg		= True
 #endif
+#ifdef CALLER_SAVES_R1
+callerSaves (VanillaReg 1 _)	= True
+#endif
+#ifdef CALLER_SAVES_R2
+callerSaves (VanillaReg 2 _)	= True
+#endif
+#ifdef CALLER_SAVES_R3
+callerSaves (VanillaReg 3 _)	= True
+#endif
+#ifdef CALLER_SAVES_R4
+callerSaves (VanillaReg 4 _)	= True
+#endif
+#ifdef CALLER_SAVES_R5
+callerSaves (VanillaReg 5 _)	= True
+#endif
+#ifdef CALLER_SAVES_R6
+callerSaves (VanillaReg 6 _)	= True
+#endif
+#ifdef CALLER_SAVES_R7
+callerSaves (VanillaReg 7 _)	= True
+#endif
+#ifdef CALLER_SAVES_R8
+callerSaves (VanillaReg 8 _)	= True
+#endif
+#ifdef CALLER_SAVES_F1
+callerSaves (FloatReg 1)	= True
+#endif
+#ifdef CALLER_SAVES_F2
+callerSaves (FloatReg 2)	= True
+#endif
+#ifdef CALLER_SAVES_F3
+callerSaves (FloatReg 3)	= True
+#endif
+#ifdef CALLER_SAVES_F4
+callerSaves (FloatReg 4)	= True
+#endif
+#ifdef CALLER_SAVES_D1
+callerSaves (DoubleReg 1)	= True
+#endif
+#ifdef CALLER_SAVES_D2
+callerSaves (DoubleReg 2)	= True
+#endif
+#ifdef CALLER_SAVES_L1
+callerSaves (LongReg 1)		= True
+#endif
 #ifdef CALLER_SAVES_Sp
 callerSaves Sp			= True
 #endif
diff --git a/compiler/deSugar/Check.lhs b/compiler/deSugar/Check.lhs
index 47fbf2e..59c102f 100644
--- a/compiler/deSugar/Check.lhs
+++ b/compiler/deSugar/Check.lhs
@@ -30,6 +30,7 @@ import Type
 import SrcLoc
 import UniqSet
 import Util
+import BasicTypes
 import Outputable
 import FastString
 \end{code}
@@ -436,14 +437,14 @@ get_lit :: Pat id -> Maybe HsLit
 -- It doesn't matter which one, because they will only be compared
 -- with other HsLits gotten in the same way
 get_lit (LitPat lit)				          = Just lit
-get_lit (NPat (OverLit { ol_val = HsIntegral i})    mb _) = Just (HsIntPrim   (mb_neg mb i))
-get_lit (NPat (OverLit { ol_val = HsFractional f }) mb _) = Just (HsFloatPrim (mb_neg mb f))
+get_lit (NPat (OverLit { ol_val = HsIntegral i})    mb _) = Just (HsIntPrim   (mb_neg negate              mb i))
+get_lit (NPat (OverLit { ol_val = HsFractional f }) mb _) = Just (HsFloatPrim (mb_neg negateFractionalLit mb f))
 get_lit (NPat (OverLit { ol_val = HsIsString s })   _  _) = Just (HsStringPrim s)
 get_lit _                                	          = Nothing
 
-mb_neg :: Num a => Maybe b -> a -> a
-mb_neg Nothing  v = v
-mb_neg (Just _) v = -v
+mb_neg :: (a -> a) -> Maybe b -> a -> a
+mb_neg _      Nothing  v = v
+mb_neg negate (Just _) v = negate v
 
 get_unused_cons :: [Pat Id] -> [DataCon]
 get_unused_cons used_cons = ASSERT( not (null used_cons) ) unused_cons
diff --git a/compiler/deSugar/DsMeta.hs b/compiler/deSugar/DsMeta.hs
index 3a2dda8..a4b47ee 100644
--- a/compiler/deSugar/DsMeta.hs
+++ b/compiler/deSugar/DsMeta.hs
@@ -1583,7 +1583,7 @@ repLiteral lit
 mk_integer :: Integer -> DsM HsLit
 mk_integer  i = do integer_ty <- lookupType integerTyConName
                    return $ HsInteger i integer_ty
-mk_rational :: Rational -> DsM HsLit
+mk_rational :: FractionalLit -> DsM HsLit
 mk_rational r = do rat_ty <- lookupType rationalTyConName
                    return $ HsRat r rat_ty
 mk_string :: FastString -> DsM HsLit
diff --git a/compiler/deSugar/MatchLit.lhs b/compiler/deSugar/MatchLit.lhs
index be112e0..0bd2538 100644
--- a/compiler/deSugar/MatchLit.lhs
+++ b/compiler/deSugar/MatchLit.lhs
@@ -33,6 +33,7 @@ import Literal
 import SrcLoc
 import Data.Ratio
 import Outputable
+import BasicTypes
 import Util
 import FastString
 \end{code}
@@ -64,8 +65,8 @@ dsLit (HsStringPrim s) = return (Lit (MachStr s))
 dsLit (HsCharPrim   c) = return (Lit (MachChar c))
 dsLit (HsIntPrim    i) = return (Lit (MachInt i))
 dsLit (HsWordPrim   w) = return (Lit (MachWord w))
-dsLit (HsFloatPrim  f) = return (Lit (MachFloat f))
-dsLit (HsDoublePrim d) = return (Lit (MachDouble d))
+dsLit (HsFloatPrim  f) = return (Lit (MachFloat (fl_value f)))
+dsLit (HsDoublePrim d) = return (Lit (MachDouble (fl_value d)))
 
 dsLit (HsChar c)       = return (mkCharExpr c)
 dsLit (HsString str)   = mkStringExprFS str
@@ -73,8 +74,8 @@ dsLit (HsInteger i _)  = mkIntegerExpr i
 dsLit (HsInt i)	       = return (mkIntExpr i)
 
 dsLit (HsRat r ty) = do
-   num   <- mkIntegerExpr (numerator r)
-   denom <- mkIntegerExpr (denominator r)
+   num   <- mkIntegerExpr (numerator (fl_value r))
+   denom <- mkIntegerExpr (denominator (fl_value r))
    return (mkConApp ratio_data_con [Type integer_ty, num, denom])
   where
     (ratio_data_con, integer_ty) 
@@ -112,8 +113,8 @@ hsLitKey (HsIntPrim     i) = mkMachInt  i
 hsLitKey (HsWordPrim    w) = mkMachWord w
 hsLitKey (HsCharPrim    c) = MachChar   c
 hsLitKey (HsStringPrim  s) = MachStr    s
-hsLitKey (HsFloatPrim   f) = MachFloat  f
-hsLitKey (HsDoublePrim  d) = MachDouble d
+hsLitKey (HsFloatPrim   f) = MachFloat  (fl_value f)
+hsLitKey (HsDoublePrim  d) = MachDouble (fl_value d)
 hsLitKey (HsString s)	   = MachStr    s
 hsLitKey l                 = pprPanic "hsLitKey" (ppr l)
 
@@ -124,8 +125,8 @@ hsOverLitKey (OverLit { ol_val = l }) neg = litValKey l neg
 litValKey :: OverLitVal -> Bool -> Literal
 litValKey (HsIntegral i)   False = MachInt i
 litValKey (HsIntegral i)   True  = MachInt (-i)
-litValKey (HsFractional r) False = MachFloat r
-litValKey (HsFractional r) True  = MachFloat (-r)
+litValKey (HsFractional r) False = MachFloat (fl_value r)
+litValKey (HsFractional r) True  = MachFloat (negate (fl_value r))
 litValKey (HsIsString s)   neg   = ASSERT( not neg) MachStr s
 \end{code}
 
@@ -186,12 +187,12 @@ tidyNPat tidy_lit_pat (OverLit val False _ ty) mb_neg _
 		   (Just _,  HsIntegral i) -> Just (-i)
 		   _ -> Nothing
 	
-    mb_rat_lit :: Maybe Rational
+    mb_rat_lit :: Maybe FractionalLit
     mb_rat_lit = case (mb_neg, val) of
-		   (Nothing, HsIntegral   i) -> Just (fromInteger i)
-		   (Just _,  HsIntegral   i) -> Just (fromInteger (-i))
+		   (Nothing, HsIntegral   i) -> Just (integralFractionalLit (fromInteger i))
+		   (Just _,  HsIntegral   i) -> Just (integralFractionalLit (fromInteger (-i)))
 		   (Nothing, HsFractional f) -> Just f
-		   (Just _, HsFractional f)  -> Just (-f)
+		   (Just _, HsFractional f)  -> Just (negateFractionalLit f)
 		   _ -> Nothing
 	
     mb_str_lit :: Maybe FastString
diff --git a/compiler/ghci/ObjLink.lhs b/compiler/ghci/ObjLink.lhs
index 310ddb5..cd593f7 100644
--- a/compiler/ghci/ObjLink.lhs
+++ b/compiler/ghci/ObjLink.lhs
@@ -28,6 +28,8 @@ import Control.Monad    ( when )
 import Foreign.C
 import Foreign		( nullPtr )
 import GHC.Exts         ( Ptr(..) )
+import GHC.IO.Encoding  ( fileSystemEncoding )
+import qualified GHC.Foreign as GHC
 
 
 
@@ -35,17 +37,21 @@ import GHC.Exts         ( Ptr(..) )
 -- RTS Linker Interface
 -- ---------------------------------------------------------------------------
 
+-- UNICODE FIXME: Unicode object/archive/DLL file names on Windows will only work in the right code page
+withFileCString :: FilePath -> (CString -> IO a) -> IO a
+withFileCString = GHC.withCString fileSystemEncoding
+
 insertSymbol :: String -> String -> Ptr a -> IO ()
 insertSymbol obj_name key symbol
     = let str = prefixUnderscore key
-      in withCString obj_name $ \c_obj_name ->
-         withCString str $ \c_str ->
+      in withFileCString obj_name $ \c_obj_name ->
+         withCAString str $ \c_str ->
           c_insertSymbol c_obj_name c_str symbol
 
 lookupSymbol :: String -> IO (Maybe (Ptr a))
 lookupSymbol str_in = do
    let str = prefixUnderscore str_in
-   withCString str $ \c_str -> do
+   withCAString str $ \c_str -> do
      addr <- c_lookupSymbol c_str
      if addr == nullPtr
 	then return Nothing
@@ -60,7 +66,7 @@ loadDLL :: String -> IO (Maybe String)
 -- Nothing      => success
 -- Just err_msg => failure
 loadDLL str = do
-  maybe_errmsg <- withCString str $ \dll -> c_addDLL dll
+  maybe_errmsg <- withFileCString str $ \dll -> c_addDLL dll
   if maybe_errmsg == nullPtr
 	then return Nothing
 	else do str <- peekCString maybe_errmsg
@@ -68,19 +74,19 @@ loadDLL str = do
 
 loadArchive :: String -> IO ()
 loadArchive str = do
-   withCString str $ \c_str -> do
+   withFileCString str $ \c_str -> do
      r <- c_loadArchive c_str
      when (r == 0) (panic ("loadArchive " ++ show str ++ ": failed"))
 
 loadObj :: String -> IO ()
 loadObj str = do
-   withCString str $ \c_str -> do
+   withFileCString str $ \c_str -> do
      r <- c_loadObj c_str
      when (r == 0) (panic ("loadObj " ++ show str ++ ": failed"))
 
 unloadObj :: String -> IO ()
 unloadObj str =
-   withCString str $ \c_str -> do
+   withFileCString str $ \c_str -> do
      r <- c_unloadObj c_str
      when (r == 0) (panic ("unloadObj " ++ show str ++ ": failed"))
 
diff --git a/compiler/hsSyn/Convert.lhs b/compiler/hsSyn/Convert.lhs
index 5933e9d..492f255 100644
--- a/compiler/hsSyn/Convert.lhs
+++ b/compiler/hsSyn/Convert.lhs
@@ -568,7 +568,7 @@ cvtOverLit :: Lit -> CvtM (HsOverLit RdrName)
 cvtOverLit (IntegerL i)  
   = do { force i; return $ mkHsIntegral i placeHolderType}
 cvtOverLit (RationalL r) 
-  = do { force r; return $ mkHsFractional r placeHolderType}
+  = do { force r; return $ mkHsFractional (cvtFractionalLit r) placeHolderType}
 cvtOverLit (StringL s)   
   = do { let { s' = mkFastString s }
        ; force s'
@@ -602,8 +602,8 @@ allCharLs xs
 cvtLit :: Lit -> CvtM HsLit
 cvtLit (IntPrimL i)    = do { force i; return $ HsIntPrim i }
 cvtLit (WordPrimL w)   = do { force w; return $ HsWordPrim w }
-cvtLit (FloatPrimL f)  = do { force f; return $ HsFloatPrim f }
-cvtLit (DoublePrimL f) = do { force f; return $ HsDoublePrim f }
+cvtLit (FloatPrimL f)  = do { force f; return $ HsFloatPrim (cvtFractionalLit f) }
+cvtLit (DoublePrimL f) = do { force f; return $ HsDoublePrim (cvtFractionalLit f) }
 cvtLit (CharL c)       = do { force c; return $ HsChar c }
 cvtLit (StringL s)     = do { let { s' = mkFastString s }
        		       	    ; force s'      
@@ -768,6 +768,9 @@ overloadedLit _             = False
 void :: Type.Type
 void = placeHolderType
 
+cvtFractionalLit :: Rational -> FractionalLit
+cvtFractionalLit r = FL { fl_text = show (fromRational r :: Double), fl_value = r }
+
 --------------------------------------------------------------------
 --	Turning Name back into RdrName
 --------------------------------------------------------------------
diff --git a/compiler/hsSyn/HsBinds.lhs b/compiler/hsSyn/HsBinds.lhs
index 81b5f18..5871914 100644
--- a/compiler/hsSyn/HsBinds.lhs
+++ b/compiler/hsSyn/HsBinds.lhs
@@ -69,23 +69,23 @@ data HsLocalBindsLR idL idR	-- Bindings in a 'let' expression
 type HsValBinds id = HsValBindsLR id id
 
 data HsValBindsLR idL idR  -- Value bindings (not implicit parameters)
-  = ValBindsIn             -- Before renaming
+  = ValBindsIn             -- Before renaming RHS; idR is always RdrName
 	(LHsBindsLR idL idR) [LSig idR]	-- Not dependency analysed
 					-- Recursive by default
 
-  | ValBindsOut		   -- After renaming
+  | ValBindsOut		   -- After renaming RHS; idR can be Name or Id
 	[(RecFlag, LHsBinds idL)]	-- Dependency analysed, later bindings 
                                         -- in the list may depend on earlier
                                         -- ones.
 	[LSig Name]
   deriving (Data, Typeable)
 
-type LHsBinds id = Bag (LHsBind id)
-type LHsBind  id = Located (HsBind id)
-type HsBind id   = HsBindLR id id
+type LHsBind  id = LHsBindLR  id id
+type LHsBinds id = LHsBindsLR id id
+type HsBind   id = HsBindLR   id id
 
-type LHsBindLR idL idR = Located (HsBindLR idL idR)
 type LHsBindsLR idL idR = Bag (LHsBindLR idL idR)
+type LHsBindLR  idL idR = Located (HsBindLR idL idR)
 
 data HsBindLR idL idR
   = -- | FunBind is used for both functions   @f x = e@
diff --git a/compiler/hsSyn/HsLit.lhs b/compiler/hsSyn/HsLit.lhs
index 4a565ff..2cda103 100644
--- a/compiler/hsSyn/HsLit.lhs
+++ b/compiler/hsSyn/HsLit.lhs
@@ -12,7 +12,8 @@ module HsLit where
 #include "HsVersions.h"
 
 import {-# SOURCE #-} HsExpr( SyntaxExpr, pprExpr )
-import HsTypes (PostTcType)
+import BasicTypes ( FractionalLit(..) )
+import HsTypes  ( PostTcType )
 import Type	( Type )
 import Outputable
 import FastString
@@ -40,10 +41,10 @@ data HsLit
   | HsWordPrim	    Integer		-- Unboxed Word
   | HsInteger	    Integer  Type	-- Genuinely an integer; arises only from TRANSLATION
 					-- 	(overloaded literals are done with HsOverLit)
-  | HsRat	    Rational Type	-- Genuinely a rational; arises only from TRANSLATION
+  | HsRat	    FractionalLit Type	-- Genuinely a rational; arises only from TRANSLATION
 					-- 	(overloaded literals are done with HsOverLit)
-  | HsFloatPrim	    Rational		-- Unboxed Float
-  | HsDoublePrim    Rational		-- Unboxed Double
+  | HsFloatPrim	    FractionalLit	-- Unboxed Float
+  | HsDoublePrim    FractionalLit	-- Unboxed Double
   deriving (Data, Typeable)
 
 instance Eq HsLit where
@@ -70,7 +71,7 @@ data HsOverLit id 	-- An overloaded literal
 
 data OverLitVal
   = HsIntegral   !Integer   	-- Integer-looking literals;
-  | HsFractional !Rational   	-- Frac-looking literals
+  | HsFractional !FractionalLit	-- Frac-looking literals
   | HsIsString   !FastString 	-- String-looking literals
   deriving (Data, Typeable)
 
@@ -142,9 +143,9 @@ instance Outputable HsLit where
     ppr (HsStringPrim s) = pprHsString s <> char '#'
     ppr (HsInt i)	 = integer i
     ppr (HsInteger i _)	 = integer i
-    ppr (HsRat f _)	 = rational f
-    ppr (HsFloatPrim f)	 = rational f <> char '#'
-    ppr (HsDoublePrim d) = rational d <> text "##"
+    ppr (HsRat f _)	 = ppr f
+    ppr (HsFloatPrim f)	 = ppr f <> char '#'
+    ppr (HsDoublePrim d) = ppr d <> text "##"
     ppr (HsIntPrim i)	 = integer i  <> char '#'
     ppr (HsWordPrim w)	 = integer w  <> text "##"
 
@@ -155,6 +156,6 @@ instance OutputableBndr id => Outputable (HsOverLit id) where
 
 instance Outputable OverLitVal where
   ppr (HsIntegral i)   = integer i 
-  ppr (HsFractional f) = rational f
+  ppr (HsFractional f) = ppr f
   ppr (HsIsString s)   = pprHsString s
 \end{code}
diff --git a/compiler/hsSyn/HsUtils.lhs b/compiler/hsSyn/HsUtils.lhs
index dfb3dd5..cc57e05 100644
--- a/compiler/hsSyn/HsUtils.lhs
+++ b/compiler/hsSyn/HsUtils.lhs
@@ -84,7 +84,6 @@ import NameSet
 import BasicTypes
 import SrcLoc
 import FastString
-import Outputable
 import Util
 import Bag
 
@@ -188,7 +187,7 @@ mkSimpleHsAlt pat expr
 -- See RnEnv.lookupSyntaxName
 
 mkHsIntegral   :: Integer -> PostTcType -> HsOverLit id
-mkHsFractional :: Rational -> PostTcType -> HsOverLit id
+mkHsFractional :: FractionalLit -> PostTcType -> HsOverLit id
 mkHsIsString   :: FastString -> PostTcType -> HsOverLit id
 mkHsDo         :: HsStmtContext Name -> [LStmt id] -> HsExpr id
 mkHsComp       :: HsStmtContext Name -> [LStmt id] -> LHsExpr id -> HsExpr id
@@ -664,11 +663,15 @@ lStmtsImplicits = hs_lstmts
 
 hsValBindsImplicits :: HsValBindsLR Name idR -> NameSet
 hsValBindsImplicits (ValBindsOut binds _)
-  = unionManyNameSets [foldBag unionNameSets (hs_bind . unLoc) emptyNameSet hs_binds | (_rec, hs_binds) <- binds]
+  = foldr (unionNameSets . lhsBindsImplicits . snd) emptyNameSet binds
+hsValBindsImplicits (ValBindsIn binds _) 
+  = lhsBindsImplicits binds
+
+lhsBindsImplicits :: LHsBindsLR Name idR -> NameSet
+lhsBindsImplicits = foldBag unionNameSets lhs_bind emptyNameSet
   where
-    hs_bind (PatBind { pat_lhs = lpat }) = lPatImplicits lpat
-    hs_bind _ = emptyNameSet
-hsValBindsImplicits (ValBindsIn {}) = pprPanic "hsValBindsImplicits: ValBindsIn" empty
+    lhs_bind (L _ (PatBind { pat_lhs = lpat })) = lPatImplicits lpat
+    lhs_bind _ = emptyNameSet
 
 lPatImplicits :: LPat Name -> NameSet
 lPatImplicits = hs_lpat
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 87bf391..d80d2a6 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -108,6 +108,8 @@ import Data.Char
 import Data.List
 import Data.Map (Map)
 import qualified Data.Map as Map
+import Data.Set (Set)
+import qualified Data.Set as Set
 import System.FilePath
 import System.IO        ( stderr, hPutChar )
 
@@ -122,6 +124,21 @@ data DynFlag
    | Opt_D_dump_raw_cmm
    | Opt_D_dump_cmmz
    | Opt_D_dump_cmmz_pretty
+   -- All of the cmmz subflags (there are a lot!)  Automatically
+   -- enabled if you run -ddump-cmmz
+   | Opt_D_dump_cmmz_cbe
+   | Opt_D_dump_cmmz_proc
+   | Opt_D_dump_cmmz_spills
+   | Opt_D_dump_cmmz_rewrite
+   | Opt_D_dump_cmmz_dead
+   | Opt_D_dump_cmmz_stub
+   | Opt_D_dump_cmmz_sp
+   | Opt_D_dump_cmmz_procmap
+   | Opt_D_dump_cmmz_split
+   | Opt_D_dump_cmmz_lower
+   | Opt_D_dump_cmmz_info
+   | Opt_D_dump_cmmz_cafs
+   -- end cmmz subflags
    | Opt_D_dump_cps_cmm
    | Opt_D_dump_cvt_cmm
    | Opt_D_dump_asm
@@ -481,6 +498,11 @@ data DynFlags = DynFlags {
   filesToClean          :: IORef [FilePath],
   dirsToClean           :: IORef (Map FilePath FilePath),
 
+  -- Names of files which were generated from -ddump-to-file; used to
+  -- track which ones we need to truncate because it's our first run
+  -- through
+  generatedDumps        :: IORef (Set FilePath),
+
   -- hsc dynamic flags
   flags                 :: [DynFlag],
   -- Don't change this without updating extensionFlags:
@@ -717,12 +739,14 @@ initDynFlags dflags = do
  ways <- readIORef v_Ways
  refFilesToClean <- newIORef []
  refDirsToClean <- newIORef Map.empty
+ refGeneratedDumps <- newIORef Set.empty
  return dflags{
         ways            = ways,
         buildTag        = mkBuildTag (filter (not . wayRTSOnly) ways),
         rtsBuildTag     = mkBuildTag ways,
         filesToClean    = refFilesToClean,
-        dirsToClean     = refDirsToClean
+        dirsToClean     = refDirsToClean,
+        generatedDumps   = refGeneratedDumps
         }
 
 -- | The normal 'DynFlags'. Note that they is not suitable for use in this form
@@ -798,6 +822,7 @@ defaultDynFlags mySettings =
         -- end of ghc -M values
         filesToClean   = panic "defaultDynFlags: No filesToClean",
         dirsToClean    = panic "defaultDynFlags: No dirsToClean",
+        generatedDumps = panic "defaultDynFlags: No generatedDumps",
         haddockOptions = Nothing,
         flags = defaultFlags,
         language = Nothing,
@@ -1187,8 +1212,8 @@ dynamic_flags = [
   , Flag "dylib-install-name" (hasArg setDylibInstallName)
 
         ------- Libraries ---------------------------------------------------
-  , Flag "L"   (Prefix    addLibraryPath)
-  , Flag "l"   (AnySuffix (upd . addOptl))
+  , Flag "L"   (Prefix addLibraryPath)
+  , Flag "l"   (hasArg (addOptl . ("-l" ++)))
 
         ------- Frameworks --------------------------------------------------
         -- -framework-path should really be -F ...
@@ -1256,6 +1281,18 @@ dynamic_flags = [
   , Flag "ddump-raw-cmm"           (setDumpFlag Opt_D_dump_raw_cmm)
   , Flag "ddump-cmmz"              (setDumpFlag Opt_D_dump_cmmz)
   , Flag "ddump-cmmz-pretty"       (setDumpFlag Opt_D_dump_cmmz_pretty)
+  , Flag "ddump-cmmz-cbe"          (setDumpFlag Opt_D_dump_cmmz_cbe)
+  , Flag "ddump-cmmz-spills"       (setDumpFlag Opt_D_dump_cmmz_spills)
+  , Flag "ddump-cmmz-proc"         (setDumpFlag Opt_D_dump_cmmz_proc)
+  , Flag "ddump-cmmz-rewrite"      (setDumpFlag Opt_D_dump_cmmz_rewrite)
+  , Flag "ddump-cmmz-dead"         (setDumpFlag Opt_D_dump_cmmz_dead)
+  , Flag "ddump-cmmz-stub"         (setDumpFlag Opt_D_dump_cmmz_stub)
+  , Flag "ddump-cmmz-sp"           (setDumpFlag Opt_D_dump_cmmz_sp)
+  , Flag "ddump-cmmz-procmap"      (setDumpFlag Opt_D_dump_cmmz_procmap)
+  , Flag "ddump-cmmz-split"        (setDumpFlag Opt_D_dump_cmmz_split)
+  , Flag "ddump-cmmz-lower"        (setDumpFlag Opt_D_dump_cmmz_lower)
+  , Flag "ddump-cmmz-info"         (setDumpFlag Opt_D_dump_cmmz_info)
+  , Flag "ddump-cmmz-cafs"         (setDumpFlag Opt_D_dump_cmmz_cafs)
   , Flag "ddump-core-stats"        (setDumpFlag Opt_D_dump_core_stats)
   , Flag "ddump-cps-cmm"           (setDumpFlag Opt_D_dump_cps_cmm)
   , Flag "ddump-cvt-cmm"           (setDumpFlag Opt_D_dump_cvt_cmm)
diff --git a/compiler/main/ErrUtils.lhs b/compiler/main/ErrUtils.lhs
index b6297a2..1c7a389 100644
--- a/compiler/main/ErrUtils.lhs
+++ b/compiler/main/ErrUtils.lhs
@@ -41,6 +41,9 @@ import StaticFlags	( opt_ErrorSpans )
 
 import System.Exit	( ExitCode(..), exitWith )
 import Data.List
+import qualified Data.Set as Set
+import Data.IORef
+import Control.Monad
 import System.IO
 
 -- -----------------------------------------------------------------------------
@@ -208,19 +211,26 @@ mkDumpDoc hdr doc
 --	otherwise emit to stdout.
 dumpSDoc :: DynFlags -> DynFlag -> String -> SDoc -> IO ()
 dumpSDoc dflags dflag hdr doc
- = do	let mFile	= chooseDumpFile dflags dflag
- 	case mFile of
-		-- write the dump to a file
-		--	don't add the header in this case, we can see what kind
-		--	of dump it is from the filename.
-		Just fileName
-		 -> do	handle	<- openFile fileName AppendMode
-		 	hPrintDump handle doc
-		 	hClose handle
-
-		-- write the dump to stdout
-		Nothing
-		 -> do	printDump (mkDumpDoc hdr doc)
+ = do let mFile = chooseDumpFile dflags dflag
+      case mFile of
+            -- write the dump to a file
+            -- don't add the header in this case, we can see what kind
+            -- of dump it is from the filename.
+            Just fileName
+                 -> do
+                        let gdref = generatedDumps dflags
+                        gd <- readIORef gdref
+                        let append = Set.member fileName gd
+                            mode = if append then AppendMode else WriteMode
+                        when (not append) $
+                            writeIORef gdref (Set.insert fileName gd)
+                        handle <- openFile fileName mode
+                        hPrintDump handle doc
+                        hClose handle
+
+            -- write the dump to stdout
+            Nothing
+                 -> printDump (mkDumpDoc hdr doc)
 
 
 -- | Choose where to put a dump file based on DynFlags
diff --git a/compiler/main/SysTools.lhs b/compiler/main/SysTools.lhs
index 436cfa6..497a938 100644
--- a/compiler/main/SysTools.lhs
+++ b/compiler/main/SysTools.lhs
@@ -822,14 +822,15 @@ getBaseDir :: IO (Maybe String)
 #if defined(mingw32_HOST_OS)
 -- Assuming we are running ghc, accessed by path  $(stuff)/bin/ghc.exe,
 -- return the path $(stuff)/lib.
-getBaseDir = do let len = (2048::Int) -- plenty, PATH_MAX is 512 under Win32.
-                buf <- mallocArray len
-                ret <- getModuleFileName nullPtr buf len
-                if ret == 0 then free buf >> return Nothing
-                            else do s <- peekCString buf
-                                    free buf
-                                    return (Just (rootDir s))
+getBaseDir = try_size 2048 -- plenty, PATH_MAX is 512 under Win32.
   where
+    try_size size = allocaArray (fromIntegral size) $ \buf -> do
+        ret <- c_GetModuleFileName nullPtr buf size
+        case ret of
+          0 -> return Nothing
+          _ | ret < size -> fmap (Just . rootDir) $ peekCWString buf
+            | otherwise  -> try_size (size * 2)
+    
     rootDir s = case splitFileName $ normalise s of
                 (d, ghc_exe)
                  | lower ghc_exe `elem` ["ghc.exe",
@@ -844,8 +845,8 @@ getBaseDir = do let len = (2048::Int) -- plenty, PATH_MAX is 512 under Win32.
         where fail = panic ("can't decompose ghc.exe path: " ++ show s)
               lower = map toLower
 
-foreign import stdcall unsafe "GetModuleFileNameA"
-  getModuleFileName :: Ptr () -> CString -> Int -> IO Int32
+foreign import stdcall unsafe "windows.h GetModuleFileNameW"
+  c_GetModuleFileName :: Ptr () -> CWString -> Word32 -> IO Word32
 #else
 getBaseDir = return Nothing
 #endif
diff --git a/compiler/nativeGen/AsmCodeGen.lhs b/compiler/nativeGen/AsmCodeGen.lhs
index 07acbbb..57faa6f 100644
--- a/compiler/nativeGen/AsmCodeGen.lhs
+++ b/compiler/nativeGen/AsmCodeGen.lhs
@@ -50,6 +50,7 @@ import qualified RegAlloc.Graph.TrivColorable	as Color
 
 import TargetReg
 import Platform
+import Config
 import Instruction
 import PIC
 import Reg
@@ -68,7 +69,6 @@ import UniqSupply
 import DynFlags
 import StaticFlags
 import Util
-import Config
 
 import Digraph
 import qualified Pretty
@@ -451,14 +451,12 @@ makeImportsDoc dflags imports
                 -- stack so add the note in:
             Pretty.$$ Pretty.text ".section .note.GNU-stack,\"\",@progbits"
 #endif
-#if !defined(darwin_TARGET_OS)
                 -- And just because every other compiler does, lets stick in
 		-- an identifier directive: .ident "GHC x.y.z"
-	    Pretty.$$ let compilerIdent = Pretty.text "GHC" Pretty.<+>
+            Pretty.$$ let compilerIdent = Pretty.text "GHC" Pretty.<+>
 	                                  Pretty.text cProjectVersion
                        in Pretty.text ".ident" Pretty.<+>
                           Pretty.doubleQuotes compilerIdent
-#endif
 
  where
 	-- Generate "symbol stubs" for all external symbols that might
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index bf34ee7..a55a631 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -68,7 +68,7 @@ import UniqFM
 import DynFlags
 import Module
 import Ctype
-import BasicTypes	( InlineSpec(..), RuleMatchInfo(..) )
+import BasicTypes	( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
 import Util		( readRational )
 
 import Control.Monad
@@ -536,14 +536,14 @@ data Token
   | ITchar       Char
   | ITstring     FastString
   | ITinteger    Integer
-  | ITrational   Rational
+  | ITrational   FractionalLit
 
   | ITprimchar   Char
   | ITprimstring FastString
   | ITprimint    Integer
   | ITprimword   Integer
-  | ITprimfloat  Rational
-  | ITprimdouble Rational
+  | ITprimfloat  FractionalLit
+  | ITprimdouble FractionalLit
 
   -- Template Haskell extension tokens
   | ITopenExpQuote  		--  [| or [e|
@@ -1056,9 +1056,12 @@ hexadecimal = (16,hexDigit)
 
 -- readRational can understand negative rationals, exponents, everything.
 tok_float, tok_primfloat, tok_primdouble :: String -> Token
-tok_float        str = ITrational   $! readRational str
-tok_primfloat    str = ITprimfloat  $! readRational str
-tok_primdouble   str = ITprimdouble $! readRational str
+tok_float        str = ITrational   $! readFractionalLit str
+tok_primfloat    str = ITprimfloat  $! readFractionalLit str
+tok_primdouble   str = ITprimdouble $! readFractionalLit str
+
+readFractionalLit :: String -> FractionalLit
+readFractionalLit str = (FL $! str) $! readRational str
 
 -- -----------------------------------------------------------------------------
 -- Layout processing
diff --git a/compiler/typecheck/TcHsSyn.lhs b/compiler/typecheck/TcHsSyn.lhs
index cd2cadf..12b50ac 100644
--- a/compiler/typecheck/TcHsSyn.lhs
+++ b/compiler/typecheck/TcHsSyn.lhs
@@ -121,7 +121,7 @@ shortCutLit (HsIntegral i) ty
   | isIntTy ty && inIntRange i   = Just (HsLit (HsInt i))
   | isWordTy ty && inWordRange i = Just (mkLit wordDataCon (HsWordPrim i))
   | isIntegerTy ty 	       	 = Just (HsLit (HsInteger i ty))
-  | otherwise		       	 = shortCutLit (HsFractional (fromInteger i)) ty
+  | otherwise		       	 = shortCutLit (HsFractional (integralFractionalLit i)) ty
 	-- The 'otherwise' case is important
 	-- Consider (3 :: Float).  Syntactically it looks like an IntLit,
 	-- so we'll call shortCutIntLit, but of course it's a float
diff --git a/compiler/typecheck/TcSplice.lhs b/compiler/typecheck/TcSplice.lhs
index d6517a6..3cc2eb5 100644
--- a/compiler/typecheck/TcSplice.lhs
+++ b/compiler/typecheck/TcSplice.lhs
@@ -1204,7 +1204,7 @@ reifyClassInstance i
 reifyType :: TypeRep.Type -> TcM TH.Type
 -- Monadic only because of failure
 reifyType ty@(ForAllTy _ _)        = reify_for_all ty
-reifyType ty@(PredTy {} `FunTy` _) = reify_for_all ty	        -- Types like ((?x::Int) => Char -> Char)
+reifyType ty@(PredTy {} `FunTy` _) = reify_for_all ty  -- Types like ((?x::Int) => Char -> Char)
 reifyType (TyVarTy tv)	    = return (TH.VarT (reifyName tv))
 reifyType (TyConApp tc tys) = reify_tc_app tc tys   -- Do not expand type synonyms here
 reifyType (AppTy t1 t2)     = do { [r1,r2] <- reifyTypes [t1,t2] ; return (r1 `TH.AppT` r2) }
diff --git a/compiler/vectorise/Vectorise/Builtins/Base.hs b/compiler/vectorise/Vectorise/Builtins/Base.hs
index 165dbda..8456d34 100644
--- a/compiler/vectorise/Vectorise/Builtins/Base.hs
+++ b/compiler/vectorise/Vectorise/Builtins/Base.hs
@@ -13,7 +13,7 @@ module Vectorise.Builtins.Base (
 	indexBuiltin,
 	
 	-- * Projections
-	selTy,
+        selTy,
 	selReplicate,
 	selPick,
 	selTags,
diff --git a/compiler/vectorise/Vectorise/Builtins/Initialise.hs b/compiler/vectorise/Vectorise/Builtins/Initialise.hs
index ecb8a98..5a6cf88 100644
--- a/compiler/vectorise/Vectorise/Builtins/Initialise.hs
+++ b/compiler/vectorise/Vectorise/Builtins/Initialise.hs
@@ -40,26 +40,62 @@ initBuiltins
 initBuiltins pkg
  = do mapM_ load dph_Orphans
 
-      -- From dph-common:Data.Array.Parallel.Lifted.PArray
-      parrayTyCon	<- externalTyCon	dph_PArray	(fsLit "PArray")
-      let [parrayDataCon] = tyConDataCons parrayTyCon
+      -- From dph-common:Data.Array.Parallel.PArray.PData
+      --     PData is a type family that maps an element type onto the type
+      --     we use to hold an array of those elements.
+      pdataTyCon	<- externalTyCon	dph_PArray_PData  (fsLit "PData")
+
+      --     PR is a type class that holds the primitive operators we can 
+      --     apply to array data. Its functions take arrays in terms of PData types.
+      prClass           <- externalClass        dph_PArray_PData  (fsLit "PR")
+      let prTyCon     = classTyCon prClass
+          [prDataCon] = tyConDataCons prTyCon
 
-      pdataTyCon	<- externalTyCon	dph_PArray	(fsLit "PData")
-      paClass           <- externalClass        dph_PArray      (fsLit "PA")
+
+      -- From dph-common:Data.Array.Parallel.PArray.PRepr
+      preprTyCon	<- externalTyCon 	dph_PArray_PRepr  (fsLit "PRepr")
+      paClass           <- externalClass        dph_PArray_PRepr  (fsLit "PA")
       let paTyCon     = classTyCon paClass
           [paDataCon] = tyConDataCons paTyCon
           paPRSel     = classSCSelId paClass 0
 
-      preprTyCon	<- externalTyCon 	dph_PArray	(fsLit "PRepr")
-      prClass           <- externalClass        dph_PArray      (fsLit "PR")
-      let prTyCon     = classTyCon prClass
-          [prDataCon] = tyConDataCons prTyCon
+      replicatePDVar    <- externalVar          dph_PArray_PRepr  (fsLit "replicatePD")
+      emptyPDVar        <- externalVar          dph_PArray_PRepr  (fsLit "emptyPD")
+      packByTagPDVar    <- externalVar          dph_PArray_PRepr  (fsLit "packByTagPD")
+      combines 		<- mapM (externalVar dph_PArray_PRepr)
+                       		[mkFastString ("combine" ++ show i ++ "PD")
+                          		| i <- [2..mAX_DPH_COMBINE]]
+
+      let combinePDVars = listArray (2, mAX_DPH_COMBINE) combines
+
 
-      closureTyCon	<- externalTyCon dph_Closure		(fsLit ":->")
+      -- From dph-common:Data.Array.Parallel.PArray.Scalar
+      --     Scalar is the class of scalar values. 
+      --     The dictionary contains functions to coerce U.Arrays of scalars
+      --     to and from the PData representation.
+      scalarClass 	<- externalClass        dph_PArray_Scalar (fsLit "Scalar")
+
+
+      -- From dph-common:Data.Array.Parallel.Lifted.PArray
+      --   A PArray (Parallel Array) holds the array length and some array elements
+      --   represented by the PData type family.
+      parrayTyCon	<- externalTyCon	dph_PArray_Base	  (fsLit "PArray")
+      let [parrayDataCon] = tyConDataCons parrayTyCon
+
+      -- From dph-common:Data.Array.Parallel.PArray.Types
+      voidTyCon		<- externalTyCon        dph_PArray_Types  (fsLit "Void")
+      voidVar           <- externalVar          dph_PArray_Types  (fsLit "void")
+      fromVoidVar       <- externalVar          dph_PArray_Types  (fsLit "fromVoid")
+      wrapTyCon		<- externalTyCon        dph_PArray_Types  (fsLit "Wrap")
+      sum_tcs		<- mapM (externalTyCon  dph_PArray_Types) (numbered "Sum" 2 mAX_DPH_SUM)
+
+      -- from dph-common:Data.Array.Parallel.PArray.PDataInstances
+      pvoidVar          <- externalVar dph_PArray_PDataInstances  (fsLit "pvoid")
+      punitVar          <- externalVar dph_PArray_PDataInstances  (fsLit "punit")
+
+
+      closureTyCon	<- externalTyCon dph_Closure		 (fsLit ":->")
 
-      -- From dph-common:Data.Array.Parallel.Lifted.Repr
-      voidTyCon		<- externalTyCon	dph_Repr	(fsLit "Void")
-      wrapTyCon		<- externalTyCon	dph_Repr	(fsLit "Wrap")
 
       -- From dph-common:Data.Array.Parallel.Lifted.Unboxed
       sel_tys		<- mapM (externalType dph_Unboxed)
@@ -77,8 +113,6 @@ initBuiltins pkg
       sel_els		<- mapM mk_elements
 				[(i,j) | i <- [2..mAX_DPH_SUM], j <- [0..i-1]]
 
-      sum_tcs		<- mapM (externalTyCon dph_Repr)
-				(numbered "Sum" 2 mAX_DPH_SUM)
 
       let selTys        = listArray (2, mAX_DPH_SUM) sel_tys
           selReplicates = listArray (2, mAX_DPH_SUM) sel_replicates
@@ -88,26 +122,14 @@ initBuiltins pkg
           sumTyCons     = listArray (2, mAX_DPH_SUM) sum_tcs
 
 
-      voidVar          <- externalVar dph_Repr		(fsLit "void")
-      pvoidVar         <- externalVar dph_Repr		(fsLit "pvoid")
-      fromVoidVar      <- externalVar dph_Repr		(fsLit "fromVoid")
-      punitVar         <- externalVar dph_Repr		(fsLit "punit")
+
       closureVar       <- externalVar dph_Closure	(fsLit "closure")
       applyVar         <- externalVar dph_Closure	(fsLit "$:")
       liftedClosureVar <- externalVar dph_Closure	(fsLit "liftedClosure")
       liftedApplyVar   <- externalVar dph_Closure	(fsLit "liftedApply")
-      replicatePDVar   <- externalVar dph_PArray	(fsLit "replicatePD")
-      emptyPDVar       <- externalVar dph_PArray	(fsLit "emptyPD")
-      packByTagPDVar   <- externalVar dph_PArray	(fsLit "packByTagPD")
-
-      combines 		<- mapM (externalVar dph_PArray)
-                       		[mkFastString ("combine" ++ show i ++ "PD")
-                          		| i <- [2..mAX_DPH_COMBINE]]
-      let combinePDVars = listArray (2, mAX_DPH_COMBINE) combines
 
-      scalarClass 	<- externalClass dph_PArray	(fsLit "Scalar")
       scalar_map	<- externalVar	dph_Scalar	(fsLit "scalar_map")
-      scalar_zip2	<- externalVar	dph_Scalar	(fsLit "scalar_zipWith")
+      scalar_zip2   <- externalVar	dph_Scalar	(fsLit "scalar_zipWith")
       scalar_zips	<- mapM (externalVar dph_Scalar)
                           	(numbered "scalar_zipWith" 3 mAX_DPH_SCALAR_ARGS)
 
@@ -162,13 +184,20 @@ initBuiltins pkg
                , liftingContext   = liftingContext
                }
   where
-    mods@(Modules {
-               dph_PArray         = dph_PArray
-             , dph_Repr           = dph_Repr
-             , dph_Closure        = dph_Closure
-             , dph_Scalar         = dph_Scalar
-             , dph_Unboxed        = dph_Unboxed
-             })
+    -- Extract out all the modules we'll use.
+    -- These are the modules from the DPH base library that contain
+    --  the primitive array types and functions that vectorised code uses.
+    mods@(Modules 
+                { dph_PArray_Base               = dph_PArray_Base
+                , dph_PArray_Scalar             = dph_PArray_Scalar
+                , dph_PArray_PRepr              = dph_PArray_PRepr
+                , dph_PArray_PData              = dph_PArray_PData
+                , dph_PArray_PDataInstances     = dph_PArray_PDataInstances
+                , dph_PArray_Types              = dph_PArray_Types
+                , dph_Closure                   = dph_Closure
+                , dph_Scalar                    = dph_Scalar
+                , dph_Unboxed                   = dph_Unboxed
+                })
       = dph_Modules pkg
 
     load get_mod = dsLoadModule doc mod
@@ -248,13 +277,13 @@ initBuiltinDataCons _
 -- | Get the names of all buildin instance functions for the PA class.
 initBuiltinPAs :: Builtins -> (InstEnv, InstEnv) -> DsM [(Name, Var)]
 initBuiltinPAs (Builtins { dphModules = mods }) insts
-  = liftM (initBuiltinDicts insts) (externalClass (dph_PArray mods) (fsLit "PA"))
+  = liftM (initBuiltinDicts insts) (externalClass (dph_PArray_PRepr mods) (fsLit "PA"))
 
 
 -- | Get the names of all builtin instance functions for the PR class.
 initBuiltinPRs :: Builtins -> (InstEnv, InstEnv) -> DsM [(Name, Var)]
 initBuiltinPRs (Builtins { dphModules = mods }) insts
-  = liftM (initBuiltinDicts insts) (externalClass (dph_PArray mods) (fsLit "PR"))
+  = liftM (initBuiltinDicts insts) (externalClass (dph_PArray_PData mods) (fsLit "PR"))
 
 
 -- | Get the names of all DPH instance functions for this class.
diff --git a/compiler/vectorise/Vectorise/Builtins/Modules.hs b/compiler/vectorise/Vectorise/Builtins/Modules.hs
index d5b10cb..6ea3595 100644
--- a/compiler/vectorise/Vectorise/Builtins/Modules.hs
+++ b/compiler/vectorise/Vectorise/Builtins/Modules.hs
@@ -10,45 +10,61 @@ import FastString
 	
 -- | Ids of the modules that contain our DPH builtins.
 data Modules 
-	= Modules 
-	{ dph_PArray		:: Module
-        , dph_Repr		:: Module
-        , dph_Closure		:: Module
-        , dph_Unboxed		:: Module
-        , dph_Instances		:: Module
-        , dph_Combinators	:: Module
-        , dph_Scalar		:: Module
-        , dph_Prelude_PArr	:: Module
-        , dph_Prelude_Int	:: Module
-        , dph_Prelude_Word8	:: Module
-        , dph_Prelude_Double	:: Module
-        , dph_Prelude_Bool	:: Module
-        , dph_Prelude_Tuple	:: Module
-	}
+  = Modules 
+  { dph_PArray_Base             :: Module
+  , dph_PArray_Scalar           :: Module
+  , dph_PArray_ScalarInstances  :: Module
+  , dph_PArray_PRepr            :: Module
+  , dph_PArray_PReprInstances   :: Module
+  , dph_PArray_PData            :: Module
+  , dph_PArray_PDataInstances   :: Module
+  , dph_PArray_Types            :: Module
+	
+  , dph_Closure	                :: Module
+  , dph_Unboxed	                :: Module
+  , dph_Combinators             :: Module
+  , dph_Scalar	                :: Module
+
+  , dph_Prelude_Int             :: Module
+  , dph_Prelude_Word8           :: Module
+  , dph_Prelude_Double          :: Module
+  , dph_Prelude_Bool            :: Module
+  , dph_Prelude_Tuple           :: Module
+  }
 
 
 -- | The locations of builtins in the current DPH library.
 dph_Modules :: PackageId -> Modules
 dph_Modules pkg 
-	= Modules 
-	{ dph_PArray         = mk (fsLit "Data.Array.Parallel.Lifted.PArray")
-	, dph_Repr           = mk (fsLit "Data.Array.Parallel.Lifted.Repr")
-	, dph_Closure        = mk (fsLit "Data.Array.Parallel.Lifted.Closure")
-	, dph_Unboxed        = mk (fsLit "Data.Array.Parallel.Lifted.Unboxed")
-	, dph_Instances      = mk (fsLit "Data.Array.Parallel.Lifted.Instances")
-	, dph_Combinators    = mk (fsLit "Data.Array.Parallel.Lifted.Combinators")
-	, dph_Scalar         = mk (fsLit "Data.Array.Parallel.Lifted.Scalar")
+  = Modules 
+  { dph_PArray_Base             = mk (fsLit "Data.Array.Parallel.PArray.Base")
+  , dph_PArray_Scalar           = mk (fsLit "Data.Array.Parallel.PArray.Scalar")
+  , dph_PArray_ScalarInstances  = mk (fsLit "Data.Array.Parallel.PArray.ScalarInstances")
+  , dph_PArray_PRepr            = mk (fsLit "Data.Array.Parallel.PArray.PRepr")
+  , dph_PArray_PReprInstances   = mk (fsLit "Data.Array.Parallel.PArray.PReprInstances")
+  , dph_PArray_PData            = mk (fsLit "Data.Array.Parallel.PArray.PData")
+  , dph_PArray_PDataInstances   = mk (fsLit "Data.Array.Parallel.PArray.PDataInstances")
+  , dph_PArray_Types            = mk (fsLit "Data.Array.Parallel.PArray.Types")
+	
+  , dph_Closure                 = mk (fsLit "Data.Array.Parallel.Lifted.Closure")
+  , dph_Unboxed                 = mk (fsLit "Data.Array.Parallel.Lifted.Unboxed")
+  , dph_Combinators             = mk (fsLit "Data.Array.Parallel.Lifted.Combinators")
+  , dph_Scalar                  = mk (fsLit "Data.Array.Parallel.Lifted.Scalar")
 
-	, dph_Prelude_PArr   = mk (fsLit "Data.Array.Parallel.Prelude.Base.PArr")
-	, dph_Prelude_Int    = mk (fsLit "Data.Array.Parallel.Prelude.Base.Int")
-	, dph_Prelude_Word8  = mk (fsLit "Data.Array.Parallel.Prelude.Base.Word8")
-	, dph_Prelude_Double = mk (fsLit "Data.Array.Parallel.Prelude.Base.Double")
-	, dph_Prelude_Bool   = mk (fsLit "Data.Array.Parallel.Prelude.Base.Bool")
-	, dph_Prelude_Tuple  = mk (fsLit "Data.Array.Parallel.Prelude.Base.Tuple")
-	}
-	where	mk = mkModule pkg . mkModuleNameFS
+  , dph_Prelude_Int             = mk (fsLit "Data.Array.Parallel.Prelude.Base.Int")
+  , dph_Prelude_Word8           = mk (fsLit "Data.Array.Parallel.Prelude.Base.Word8")
+  , dph_Prelude_Double          = mk (fsLit "Data.Array.Parallel.Prelude.Base.Double")
+  , dph_Prelude_Bool            = mk (fsLit "Data.Array.Parallel.Prelude.Base.Bool")
+  , dph_Prelude_Tuple           = mk (fsLit "Data.Array.Parallel.Prelude.Base.Tuple")
+  }
+  where	mk = mkModule pkg . mkModuleNameFS
 
 
--- | Project out ids of modules that contain orphan instances that we need to load.
 dph_Orphans :: [Modules -> Module]
-dph_Orphans = [dph_Repr, dph_Instances]
+dph_Orphans
+ = [ dph_PArray_Scalar
+   , dph_PArray_ScalarInstances
+   , dph_PArray_PReprInstances
+   , dph_PArray_PDataInstances
+   , dph_Scalar
+   ]
diff --git a/compiler/vectorise/Vectorise/Builtins/Prelude.hs b/compiler/vectorise/Vectorise/Builtins/Prelude.hs
index b0f305d..51b3d14 100644
--- a/compiler/vectorise/Vectorise/Builtins/Prelude.hs
+++ b/compiler/vectorise/Vectorise/Builtins/Prelude.hs
@@ -25,36 +25,18 @@ preludeVars :: Modules
 	-> [( Module, FastString	--   Maps the original variable to the one in the DPH 
 	    , Module, FastString)]      --   packages that it should be rewritten to.
 preludeVars (Modules { dph_Combinators    = _dph_Combinators
-                     , dph_PArray         = _dph_PArray
                      , dph_Prelude_Int    = dph_Prelude_Int
                      , dph_Prelude_Word8  = dph_Prelude_Word8
                      , dph_Prelude_Double = dph_Prelude_Double
                      , dph_Prelude_Bool   = dph_Prelude_Bool 
-                     , dph_Prelude_PArr   = _dph_Prelude_PArr
                      })
 
-    -- Functions that work on whole PArrays, defined in GHC.PArr
-  = [ {- mk gHC_PARR' (fsLit "mapP")       dph_Combinators (fsLit "mapPA")
-    , mk gHC_PARR' (fsLit "zipWithP")   dph_Combinators (fsLit "zipWithPA")
-    , mk gHC_PARR' (fsLit "zipP")       dph_Combinators (fsLit "zipPA")
-    , mk gHC_PARR' (fsLit "unzipP")     dph_Combinators (fsLit "unzipPA")
-    , mk gHC_PARR' (fsLit "filterP")    dph_Combinators (fsLit "filterPA")
-    , mk gHC_PARR' (fsLit "lengthP")    dph_Combinators (fsLit "lengthPA")
-    , mk gHC_PARR' (fsLit "replicateP") dph_Combinators (fsLit "replicatePA")
-    , mk gHC_PARR' (fsLit "!:")         dph_Combinators (fsLit "indexPA")
-    , mk gHC_PARR' (fsLit "sliceP")     dph_Combinators (fsLit "slicePA")
-    , mk gHC_PARR' (fsLit "crossMapP")  dph_Combinators (fsLit "crossMapPA")
-    , mk gHC_PARR' (fsLit "singletonP") dph_Combinators (fsLit "singletonPA")
-    , mk gHC_PARR' (fsLit "concatP")    dph_Combinators (fsLit "concatPA")
-    , mk gHC_PARR' (fsLit "+:+")        dph_Combinators (fsLit "appPA")
-    , mk gHC_PARR' (fsLit "emptyP")     dph_PArray      (fsLit "emptyPA")
-
+  = [ 
     -- Map scalar functions to versions using closures. 
-    , -} mk' dph_Prelude_Int "div"         "divV"
+      mk' dph_Prelude_Int "div"         "divV"
     , mk' dph_Prelude_Int "mod"         "modV"
     , mk' dph_Prelude_Int "sqrt"        "sqrtV"
     , mk' dph_Prelude_Int "enumFromToP" "enumFromToPA"
-    -- , mk' dph_Prelude_Int "upToP" "upToPA"
     ]
     ++ vars_Ord dph_Prelude_Int
     ++ vars_Num dph_Prelude_Int
@@ -80,17 +62,7 @@ preludeVars (Modules { dph_Combinators    = _dph_Combinators
     , mk gHC_CLASSES (fsLit "not")         dph_Prelude_Bool (fsLit "notV")
     , mk gHC_CLASSES (fsLit "&&")          dph_Prelude_Bool (fsLit "andV")
     , mk gHC_CLASSES (fsLit "||")          dph_Prelude_Bool (fsLit "orV")
-
-{-
-    -- FIXME: temporary
-    , mk dph_Prelude_PArr (fsLit "fromPArrayP")       dph_Prelude_PArr (fsLit "fromPArrayPA")
-    , mk dph_Prelude_PArr (fsLit "toPArrayP")         dph_Prelude_PArr (fsLit "toPArrayPA")
-    , mk dph_Prelude_PArr (fsLit "fromNestedPArrayP") dph_Prelude_PArr (fsLit "fromNestedPArrayPA")
-    , mk dph_Prelude_PArr (fsLit "combineP")          dph_Combinators  (fsLit "combine2PA")
-    , mk dph_Prelude_PArr (fsLit "updateP")           dph_Combinators  (fsLit "updatePA")
-    , mk dph_Prelude_PArr (fsLit "bpermuteP")         dph_Combinators  (fsLit "bpermutePA")
-    , mk dph_Prelude_PArr (fsLit "indexedP")          dph_Combinators  (fsLit "indexedPA")
--}    ]
+    ]
   where
     mk  = (,,,)
     mk' mod v v' = mk mod (fsLit v) mod (fsLit v')
diff --git a/includes/Rts.h b/includes/Rts.h
index 51351fa..3a6c6f2 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -219,6 +219,12 @@ DLL_IMPORT_RTS extern char **prog_argv;	/* so we can get at these from Haskell *
 DLL_IMPORT_RTS extern int    prog_argc;
 DLL_IMPORT_RTS extern char  *prog_name;
 
+#ifdef mingw32_HOST_OS
+// We need these two from Haskell too
+void getWin32ProgArgv(int *argc, wchar_t **argv[]);
+void setWin32ProgArgv(int argc, wchar_t *argv[]);
+#endif
+
 void stackOverflow(void);
 
 void stg_exit(int n) GNU_ATTRIBUTE(__noreturn__);
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index cd98666..6b1d319 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -67,6 +67,11 @@
    Caller-saves regs have to be saved around C-calls made from STG
    land, so this file defines CALLER_SAVES_<reg> for each <reg> that
    is designated caller-saves in that machine's C calling convention.
+
+   As it stands, the only registers that are ever marked caller saves
+   are the RX, FX, DX and USER registers; as a result, if you
+   decide to caller save a system register (e.g. SP, HP, etc), note that
+   this code path is completely untested! -- EZY
    -------------------------------------------------------------------------- */
 
 /* -----------------------------------------------------------------------------
diff --git a/rts/Linker.c b/rts/Linker.c
index c1310b0..28ba9a0 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -387,6 +387,8 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_asyncReadzh)                     \
       SymI_HasProto(stg_asyncWritezh)                    \
       SymI_HasProto(stg_asyncDoProczh)                   \
+      SymI_HasProto(getWin32ProgArgv)                    \
+      SymI_HasProto(setWin32ProgArgv)                    \
       SymI_HasProto(memset)                              \
       SymI_HasProto(inet_ntoa)                           \
       SymI_HasProto(inet_addr)                           \
@@ -2335,6 +2337,7 @@ unloadObj( char *path )
             //  stgFree(oc->image);
             // #endif
             stgFree(oc->fileName);
+            stgFree(oc->archiveMemberName);
             stgFree(oc->symbols);
             stgFree(oc->sections);
             stgFree(oc);
@@ -3680,31 +3683,6 @@ PLTSize(void)
  * Generic ELF functions
  */
 
-static char *
-findElfSection ( void* objImage, Elf_Word sh_type )
-{
-   char* ehdrC = (char*)objImage;
-   Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
-   Elf_Shdr* shdr = (Elf_Shdr*)(ehdrC + ehdr->e_shoff);
-   char* sh_strtab = ehdrC + shdr[ehdr->e_shstrndx].sh_offset;
-   char* ptr = NULL;
-   int i;
-
-   for (i = 0; i < ehdr->e_shnum; i++) {
-      if (shdr[i].sh_type == sh_type
-          /* Ignore the section header's string table. */
-          && i != ehdr->e_shstrndx
-          /* Ignore string tables named .stabstr, as they contain
-             debugging info. */
-          && 0 != memcmp(".stabstr", sh_strtab + shdr[i].sh_name, 8)
-         ) {
-         ptr = ehdrC + shdr[i].sh_offset;
-         break;
-      }
-   }
-   return ptr;
-}
-
 static int
 ocVerifyImage_ELF ( ObjectCode* oc )
 {
@@ -3712,7 +3690,6 @@ ocVerifyImage_ELF ( ObjectCode* oc )
    Elf_Sym*  stab;
    int i, j, nent, nstrtab, nsymtabs;
    char* sh_strtab;
-   char* strtab;
 
    char*     ehdrC = (char*)(oc->image);
    Elf_Ehdr* ehdr  = (Elf_Ehdr*)ehdrC;
@@ -3794,20 +3771,64 @@ ocVerifyImage_ELF ( ObjectCode* oc )
                ehdrC + shdr[i].sh_offset,
                       ehdrC + shdr[i].sh_offset + shdr[i].sh_size - 1));
 
-      if (shdr[i].sh_type == SHT_REL) {
-          IF_DEBUG(linker,debugBelch("Rel  " ));
-      } else if (shdr[i].sh_type == SHT_RELA) {
-          IF_DEBUG(linker,debugBelch("RelA " ));
-      } else {
-          IF_DEBUG(linker,debugBelch("     "));
+#define SECTION_INDEX_VALID(ndx) (ndx > SHN_UNDEF && ndx < ehdr->e_shnum)
+
+      switch (shdr[i].sh_type) {
+
+        case SHT_REL:
+        case SHT_RELA:
+          IF_DEBUG(linker,debugBelch( shdr[i].sh_type == SHT_REL ? "Rel  " : "RelA "));
+
+          if (!SECTION_INDEX_VALID(shdr[i].sh_link)) {
+            if (shdr[i].sh_link == SHN_UNDEF)
+              errorBelch("\n%s: relocation section #%d has no symbol table\n"
+                         "This object file has probably been fully striped. "
+                         "Such files cannot be linked.\n",
+                         oc->archiveMemberName ? oc->archiveMemberName : oc->fileName, i);
+            else
+              errorBelch("\n%s: relocation section #%d has an invalid link field (%d)\n",
+                         oc->archiveMemberName ? oc->archiveMemberName : oc->fileName,
+                         i, shdr[i].sh_link);
+            return 0;
+          }
+          if (shdr[shdr[i].sh_link].sh_type != SHT_SYMTAB) {
+            errorBelch("\n%s: relocation section #%d does not link to a symbol table\n",
+                       oc->archiveMemberName ? oc->archiveMemberName : oc->fileName, i);
+            return 0;
+          }
+          if (!SECTION_INDEX_VALID(shdr[i].sh_info)) {
+            errorBelch("\n%s: relocation section #%d has an invalid info field (%d)\n",
+                       oc->archiveMemberName ? oc->archiveMemberName : oc->fileName,
+                       i, shdr[i].sh_info);
+            return 0;
+          }
+
+          break;
+        case SHT_SYMTAB:
+          IF_DEBUG(linker,debugBelch("Sym  "));
+
+          if (!SECTION_INDEX_VALID(shdr[i].sh_link)) {
+            errorBelch("\n%s: symbol table section #%d has an invalid link field (%d)\n",
+                       oc->archiveMemberName ? oc->archiveMemberName : oc->fileName,
+                       i, shdr[i].sh_link);
+            return 0;
+          }
+          if (shdr[shdr[i].sh_link].sh_type != SHT_STRTAB) {
+            errorBelch("\n%s: symbol table section #%d does not link to a string table\n",
+                       oc->archiveMemberName ? oc->archiveMemberName : oc->fileName, i);
+
+            return 0;
+          }
+          break;
+        case SHT_STRTAB: IF_DEBUG(linker,debugBelch("Str  ")); break;
+        default:         IF_DEBUG(linker,debugBelch("     ")); break;
       }
       if (sh_strtab) {
           IF_DEBUG(linker,debugBelch("sname=%s\n", sh_strtab + shdr[i].sh_name ));
       }
    }
 
-   IF_DEBUG(linker,debugBelch( "\nString tables" ));
-   strtab = NULL;
+   IF_DEBUG(linker,debugBelch( "\nString tables\n" ));
    nstrtab = 0;
    for (i = 0; i < ehdr->e_shnum; i++) {
       if (shdr[i].sh_type == SHT_STRTAB
@@ -3817,18 +3838,16 @@ ocVerifyImage_ELF ( ObjectCode* oc )
              debugging info. */
           && 0 != memcmp(".stabstr", sh_strtab + shdr[i].sh_name, 8)
          ) {
-         IF_DEBUG(linker,debugBelch("   section %d is a normal string table", i ));
-         strtab = ehdrC + shdr[i].sh_offset;
+         IF_DEBUG(linker,debugBelch("   section %d is a normal string table\n", i ));
          nstrtab++;
       }
    }
-   if (nstrtab != 1) {
-      errorBelch("%s: no string tables, or too many", oc->fileName);
-      return 0;
+   if (nstrtab == 0) {
+      IF_DEBUG(linker,debugBelch("   no normal string tables (potentially, but not necessarily a problem)\n"));
    }
 
    nsymtabs = 0;
-   IF_DEBUG(linker,debugBelch( "\nSymbol tables" ));
+   IF_DEBUG(linker,debugBelch( "Symbol tables\n" ));
    for (i = 0; i < ehdr->e_shnum; i++) {
       if (shdr[i].sh_type != SHT_SYMTAB) continue;
       IF_DEBUG(linker,debugBelch( "section %d is a symbol table\n", i ));
@@ -3870,13 +3889,17 @@ ocVerifyImage_ELF ( ObjectCode* oc )
          }
          IF_DEBUG(linker,debugBelch("  " ));
 
-         IF_DEBUG(linker,debugBelch("name=%s\n", strtab + stab[j].st_name ));
+         IF_DEBUG(linker,debugBelch("name=%s\n",
+                        ehdrC + shdr[shdr[i].sh_link].sh_offset
+                              + stab[j].st_name ));
       }
    }
 
    if (nsymtabs == 0) {
-      errorBelch("%s: didn't find any symbol tables", oc->fileName);
-      return 0;
+     // Not having a symbol table is not in principle a problem.
+     // When an object file has no symbols then the 'strip' program
+     // typically will remove the symbol table entirely.
+     IF_DEBUG(linker,debugBelch("   no symbol tables (potentially, but not necessarily a problem)\n"));
    }
 
    return 1;
@@ -3923,16 +3946,11 @@ ocGetNames_ELF ( ObjectCode* oc )
 
    char*     ehdrC    = (char*)(oc->image);
    Elf_Ehdr* ehdr     = (Elf_Ehdr*)ehdrC;
-   char*     strtab   = findElfSection ( ehdrC, SHT_STRTAB );
+   char*     strtab;
    Elf_Shdr* shdr     = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
 
    ASSERT(symhash != NULL);
 
-   if (!strtab) {
-      errorBelch("%s: no strtab", oc->fileName);
-      return 0;
-   }
-
    k = 0;
    for (i = 0; i < ehdr->e_shnum; i++) {
       /* Figure out what kind of section it is.  Logic derived from
@@ -3965,12 +3983,16 @@ ocGetNames_ELF ( ObjectCode* oc )
 
       /* copy stuff into this module's object symbol table */
       stab = (Elf_Sym*) (ehdrC + shdr[i].sh_offset);
+      strtab = ehdrC + shdr[shdr[i].sh_link].sh_offset;
       nent = shdr[i].sh_size / sizeof(Elf_Sym);
 
       oc->n_symbols = nent;
       oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
                                    "ocGetNames_ELF(oc->symbols)");
 
+      //TODO: we ignore local symbols anyway right? So we can use the
+      //      shdr[i].sh_info to get the index of the first non-local symbol
+      // ie we should use j = shdr[i].sh_info
       for (j = 0; j < nent; j++) {
 
          char  isLocal = FALSE; /* avoids uninit-var warning */
@@ -4068,21 +4090,24 @@ ocGetNames_ELF ( ObjectCode* oc )
    relocations appear to be of this form. */
 static int
 do_Elf_Rel_relocations ( ObjectCode* oc, char* ehdrC,
-                         Elf_Shdr* shdr, int shnum,
-                         Elf_Sym*  stab, char* strtab )
+                         Elf_Shdr* shdr, int shnum )
 {
    int j;
    char *symbol;
    Elf_Word* targ;
    Elf_Rel*  rtab = (Elf_Rel*) (ehdrC + shdr[shnum].sh_offset);
+   Elf_Sym*  stab;
+   char*     strtab;
    int         nent = shdr[shnum].sh_size / sizeof(Elf_Rel);
    int target_shndx = shdr[shnum].sh_info;
    int symtab_shndx = shdr[shnum].sh_link;
+   int strtab_shndx = shdr[symtab_shndx].sh_link;
 
    stab  = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   strtab= (char*)    (ehdrC + shdr[ strtab_shndx ].sh_offset);
    targ  = (Elf_Word*)(ehdrC + shdr[ target_shndx ].sh_offset);
-   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
-                          target_shndx, symtab_shndx ));
+   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d and strtab %d\n",
+                          target_shndx, symtab_shndx, strtab_shndx ));
 
    /* Skip sections that we're not interested in. */
    {
@@ -4168,18 +4193,21 @@ do_Elf_Rel_relocations ( ObjectCode* oc, char* ehdrC,
    sparc-solaris relocations appear to be of this form. */
 static int
 do_Elf_Rela_relocations ( ObjectCode* oc, char* ehdrC,
-                          Elf_Shdr* shdr, int shnum,
-                          Elf_Sym*  stab, char* strtab )
+                          Elf_Shdr* shdr, int shnum )
 {
    int j;
    char *symbol = NULL;
    Elf_Addr targ;
    Elf_Rela* rtab = (Elf_Rela*) (ehdrC + shdr[shnum].sh_offset);
+   Elf_Sym*  stab;
+   char*     strtab;
    int         nent = shdr[shnum].sh_size / sizeof(Elf_Rela);
    int target_shndx = shdr[shnum].sh_info;
    int symtab_shndx = shdr[shnum].sh_link;
+   int strtab_shndx = shdr[symtab_shndx].sh_link;
 
    stab  = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   strtab= (char*)    (ehdrC + shdr[ strtab_shndx ].sh_offset);
    targ  = (Elf_Addr) (ehdrC + shdr[ target_shndx ].sh_offset);
    IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
                           target_shndx, symtab_shndx ));
@@ -4448,35 +4476,20 @@ do_Elf_Rela_relocations ( ObjectCode* oc, char* ehdrC,
 static int
 ocResolve_ELF ( ObjectCode* oc )
 {
-   char *strtab;
    int   shnum, ok;
-   Elf_Sym*  stab  = NULL;
    char*     ehdrC = (char*)(oc->image);
    Elf_Ehdr* ehdr  = (Elf_Ehdr*) ehdrC;
    Elf_Shdr* shdr  = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
 
-   /* first find "the" symbol table */
-   stab = (Elf_Sym*) findElfSection ( ehdrC, SHT_SYMTAB );
-
-   /* also go find the string table */
-   strtab = findElfSection ( ehdrC, SHT_STRTAB );
-
-   if (stab == NULL || strtab == NULL) {
-      errorBelch("%s: can't find string or symbol table", oc->fileName);
-      return 0;
-   }
-
    /* Process the relocation sections. */
    for (shnum = 0; shnum < ehdr->e_shnum; shnum++) {
       if (shdr[shnum].sh_type == SHT_REL) {
-         ok = do_Elf_Rel_relocations ( oc, ehdrC, shdr,
-                                       shnum, stab, strtab );
+         ok = do_Elf_Rel_relocations ( oc, ehdrC, shdr, shnum );
          if (!ok) return ok;
       }
       else
       if (shdr[shnum].sh_type == SHT_RELA) {
-         ok = do_Elf_Rela_relocations ( oc, ehdrC, shdr,
-                                        shnum, stab, strtab );
+         ok = do_Elf_Rela_relocations ( oc, ehdrC, shdr, shnum );
          if (!ok) return ok;
       }
    }
@@ -4509,8 +4522,12 @@ static int ocAllocateSymbolExtras_ELF( ObjectCode *oc )
 
   if( i == ehdr->e_shnum )
   {
-    errorBelch( "This ELF file contains no symtab" );
-    return 0;
+    // Not having a symbol table is not in principle a problem.
+    // When an object file has no symbols then the 'strip' program
+    // typically will remove the symbol table entirely.
+    IF_DEBUG(linker, debugBelch( "The ELF file %s contains no symtab\n",
+             oc->archiveMemberName ? oc->archiveMemberName : oc->fileName ));
+    return 1;
   }
 
   if( shdr[i].sh_entsize != sizeof( Elf_Sym ) )
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 408e1c7..1408070 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -34,6 +34,14 @@ char  **full_prog_argv = NULL;
 char   *prog_name = NULL; /* 'basename' of prog_argv[0] */
 int     rts_argc = 0;  /* ditto */
 char   *rts_argv[MAX_RTS_ARGS];
+#if defined(mingw32_HOST_OS)
+// On Windows, we want to use GetCommandLineW rather than argc/argv,
+// but we need to mutate the command line arguments for withProgName and
+// friends. The System.Environment module achieves that using this bit of
+// shared state:
+int       win32_prog_argc = 0;
+wchar_t **win32_prog_argv = NULL;
+#endif
 
 /*
  * constants, used later 
@@ -1536,3 +1544,53 @@ freeFullProgArgv (void)
     full_prog_argc = 0;
     full_prog_argv = NULL;
 }
+
+#if defined(mingw32_HOST_OS)
+void freeWin32ProgArgv (void);
+
+void
+freeWin32ProgArgv (void)
+{
+    int i;
+
+    if (win32_prog_argv != NULL) {
+        for (i = 0; i < win32_prog_argc; i++) {
+            stgFree(win32_prog_argv[i]);
+        }
+        stgFree(win32_prog_argv);
+    }
+
+    win32_prog_argc = 0;
+    win32_prog_argv = NULL;
+}
+
+void
+getWin32ProgArgv(int *argc, wchar_t **argv[])
+{
+    *argc = win32_prog_argc;
+    *argv = win32_prog_argv;
+}
+
+void
+setWin32ProgArgv(int argc, wchar_t *argv[])
+{
+	int i;
+    
+	freeWin32ProgArgv();
+
+    win32_prog_argc = argc;
+	if (argv == NULL) {
+		win32_prog_argv = NULL;
+		return;
+	}
+	
+    win32_prog_argv = stgCallocBytes(argc + 1, sizeof (wchar_t *),
+                                    "setWin32ProgArgv 1");
+    for (i = 0; i < argc; i++) {
+        win32_prog_argv[i] = stgMallocBytes((wcslen(argv[i]) + 1) * sizeof(wchar_t),
+                                           "setWin32ProgArgv 2");
+        wcscpy(win32_prog_argv[i], argv[i]);
+    }
+    win32_prog_argv[argc] = NULL;
+}
+#endif
diff --git a/utils/ghc-pkg/Main.hs b/utils/ghc-pkg/Main.hs
index 1cec56a..74f761b 100644
--- a/utils/ghc-pkg/Main.hs
+++ b/utils/ghc-pkg/Main.hs
@@ -1487,16 +1487,17 @@ getExecDir cmd =
           removeCmdSuffix = return . Just . initN (length cmd) . unDosifyPath
 
 getExecPath :: IO (Maybe String)
-getExecPath =
-     allocaArray len $ \buf -> do
-         ret <- getModuleFileName nullPtr buf len
-         if ret == 0 then return Nothing
-	             else liftM Just $ peekCString buf
-    where len = 2048 -- Plenty, PATH_MAX is 512 under Win32.
-
-foreign import stdcall unsafe "GetModuleFileNameA"
-    getModuleFileName :: Ptr () -> CString -> Int -> IO Int32
-
+getExecPath = try_size 2048 -- plenty, PATH_MAX is 512 under Win32.
+  where
+    try_size size = allocaArray (fromIntegral size) $ \buf -> do
+        ret <- c_GetModuleFileName nullPtr buf size
+        case ret of
+          0 -> return Nothing
+          _ | ret < size -> fmap Just $ peekCWString buf
+            | otherwise  -> try_size (size * 2)
+
+foreign import stdcall unsafe "windows.h GetModuleFileNameW"
+  c_GetModuleFileName :: Ptr () -> CWString -> Word32 -> IO Word32
 #else
 getLibDir :: IO (Maybe String)
 getLibDir = return Nothing
diff --git a/utils/runghc/runghc.hs b/utils/runghc/runghc.hs
index ab49513..4424c96 100644
--- a/utils/runghc/runghc.hs
+++ b/utils/runghc/runghc.hs
@@ -149,15 +149,17 @@ dieProg msg = do
 
 getExecPath :: IO (Maybe String)
 #if defined(mingw32_HOST_OS)
-getExecPath =
-     allocaArray len $ \buf -> do
-         ret <- getModuleFileName nullPtr buf len
-         if ret == 0 then return Nothing
-                     else liftM Just $ peekCString buf
-    where len = 2048 -- Plenty, PATH_MAX is 512 under Win32.
-
-foreign import stdcall unsafe "GetModuleFileNameA"
-    getModuleFileName :: Ptr () -> CString -> Int -> IO Int32
+getExecPath = try_size 2048 -- plenty, PATH_MAX is 512 under Win32.
+  where
+    try_size size = allocaArray (fromIntegral size) $ \buf -> do
+        ret <- c_GetModuleFileName nullPtr buf size
+        case ret of
+          0 -> return Nothing
+          _ | ret < size -> fmap Just $ peekCWString buf
+            | otherwise  -> try_size (size * 2)
+
+foreign import stdcall unsafe "windows.h GetModuleFileNameW"
+  c_GetModuleFileName :: Ptr () -> CWString -> Word32 -> IO Word32
 #else
 getExecPath = return Nothing
 #endif