1 {-# OPTIONS -fno-warn-unused-binds #-}
3 module RegAlloc.Graph.TrivColorable (
9 #include "HsVersions.h"
20 -- trivColorable ---------------------------------------------------------------
22 -- trivColorable function for the graph coloring allocator
24 -- This gets hammered by scanGraph during register allocation,
25 -- so needs to be fairly efficient.
27 -- NOTE: This only works for arcitectures with just RcInteger and RcDouble
28 -- (which are disjoint) ie. x86, x86_64 and ppc
30 -- The number of allocatable regs is hard coded here so we can do a fast
31 -- comparision in trivColorable.
33 -- It's ok if these numbers are _less_ than the actual number of free regs,
34 -- but they can't be more or the register conflict graph won't color.
36 -- If the graph doesn't color then the allocator will panic, but it won't
37 -- generate bad object code or anything nasty like that.
39 -- There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing
40 -- is too slow for us here.
42 -- Look at includes/stg/MachRegs.h to get these numbers.
46 #define ALLOCATABLE_REGS_INTEGER (_ILIT(3))
47 #define ALLOCATABLE_REGS_DOUBLE (_ILIT(6))
48 #define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
49 #define ALLOCATABLE_REGS_SSE (_ILIT(8))
52 #elif x86_64_TARGET_ARCH
53 #define ALLOCATABLE_REGS_INTEGER (_ILIT(5))
54 #define ALLOCATABLE_REGS_DOUBLE (_ILIT(0))
55 #define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
56 #define ALLOCATABLE_REGS_SSE (_ILIT(10))
58 #elif powerpc_TARGET_ARCH
59 #define ALLOCATABLE_REGS_INTEGER (_ILIT(16))
60 #define ALLOCATABLE_REGS_DOUBLE (_ILIT(26))
61 #define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
62 #define ALLOCATABLE_REGS_SSE (_ILIT(0))
65 #elif sparc_TARGET_ARCH
66 #define ALLOCATABLE_REGS_INTEGER (_ILIT(14))
67 #define ALLOCATABLE_REGS_DOUBLE (_ILIT(11))
68 #define ALLOCATABLE_REGS_FLOAT (_ILIT(22))
69 #define ALLOCATABLE_REGS_SSE (_ILIT(0))
73 #error ToDo: choose which trivColorable function to use for this architecture.
78 -- Disjoint registers ----------------------------------------------------------
80 -- The definition has been unfolded into individual cases for speed.
81 -- Each architecture has a different register setup, so we use a
82 -- different regSqueeze function for each.
91 accSqueeze count maxCount squeeze ufm = acc count (eltsUFM ufm)
92 where acc count [] = count
93 acc count _ | count >=# maxCount = count
94 acc count (r:rs) = acc (count +# squeeze r) rs
99 Doing a nice fold over the UniqSet makes trivColorable use
100 32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs.
101 Therefore the UniqFM is made non-abstract and we use custom fold.
104 When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
105 representation any more. But it is imperative that the assSqueeze stops
106 the folding if the count gets greater or equal to maxCount. We thus convert
107 UniqFM to a (lazy) list, do the fold and stops if necessary, which was
108 the most efficient variant tried. Benchmark compiling 10-times SHA1.lhs follows.
109 (original = previous implementation, folding = fold of the whole UFM,
110 lazyFold = the current implementation,
111 hackFold = using internal representation of Data.IntMap)
113 original folding hackFold lazyFold
114 -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s
115 100.00% 96.44% 97.72% 97.12%
116 -fregs-graph 67.938s 74.875s 62.673s 64.679s
117 100.00% 110.21% 92.25% 95.20%
118 -fregs-iterative 89.761s 143.913s 81.075s 86.912s
119 100.00% 160.33% 90.32% 96.83%
120 -fnew-codegen 38.225s 37.142s 37.551s 37.119s
121 100.00% 97.17% 98.24% 97.11%
122 -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s
123 100.00% 99.70% 95.19% 94.65%
124 -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s
125 100.00% 166.23% 94.18% 100.95%
129 :: (RegClass -> VirtualReg -> FastInt)
130 -> (RegClass -> RealReg -> FastInt)
131 -> Triv VirtualReg RegClass RealReg
133 trivColorable virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
134 | count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_INTEGER
135 (virtualRegSqueeze RcInteger)
138 , count3 <- accSqueeze count2 ALLOCATABLE_REGS_INTEGER
139 (realRegSqueeze RcInteger)
142 = count3 <# ALLOCATABLE_REGS_INTEGER
144 trivColorable virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
145 | count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_FLOAT
146 (virtualRegSqueeze RcFloat)
149 , count3 <- accSqueeze count2 ALLOCATABLE_REGS_FLOAT
150 (realRegSqueeze RcFloat)
153 = count3 <# ALLOCATABLE_REGS_FLOAT
155 trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
156 | count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_DOUBLE
157 (virtualRegSqueeze RcDouble)
160 , count3 <- accSqueeze count2 ALLOCATABLE_REGS_DOUBLE
161 (realRegSqueeze RcDouble)
164 = count3 <# ALLOCATABLE_REGS_DOUBLE
166 trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
167 | count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_SSE
168 (virtualRegSqueeze RcDoubleSSE)
171 , count3 <- accSqueeze count2 ALLOCATABLE_REGS_SSE
172 (realRegSqueeze RcDoubleSSE)
175 = count3 <# ALLOCATABLE_REGS_SSE
178 -- Specification Code ----------------------------------------------------------
180 -- The trivColorable function for each particular architecture should
181 -- implement the following function, but faster.
185 trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
186 trivColorable classN conflicts exclusions
189 acc :: Reg -> (Int, Int) -> (Int, Int)
192 RcInteger -> (cd+1, cf)
193 RcFloat -> (cd, cf+1)
194 _ -> panic "Regs.trivColorable: reg class not handled"
196 tmp = foldUniqSet acc (0, 0) conflicts
197 (countInt, countFloat) = foldUniqSet acc tmp exclusions
199 squeese = worst countInt classN RcInteger
200 + worst countFloat classN RcFloat
202 in squeese < allocatableRegsInClass classN
204 -- | Worst case displacement
205 -- node N of classN has n neighbors of class C.
207 -- We currently only have RcInteger and RcDouble, which don't conflict at all.
208 -- This is a bit boring compared to what's in RegArchX86.
210 worst :: Int -> RegClass -> RegClass -> Int
211 worst n classN classC
215 RcInteger -> min n (allocatableRegsInClass RcInteger)
220 RcFloat -> min n (allocatableRegsInClass RcFloat)
223 -- allocatableRegs is allMachRegNos with the fixed-use regs removed.
224 -- i.e., these are the regs for which we are prepared to allow the
225 -- register allocator to attempt to map VRegs to.
226 allocatableRegs :: [RegNo]
228 = let isFree i = isFastTrue (freeReg i)
229 in filter isFree allMachRegNos
232 -- | The number of regs in each class.
233 -- We go via top level CAFs to ensure that we're not recomputing
234 -- the length of these lists each time the fn is called.
235 allocatableRegsInClass :: RegClass -> Int
236 allocatableRegsInClass cls
238 RcInteger -> allocatableRegsInteger
239 RcFloat -> allocatableRegsDouble
241 allocatableRegsInteger :: Int
242 allocatableRegsInteger
243 = length $ filter (\r -> regClass r == RcInteger)
244 $ map RealReg allocatableRegs
246 allocatableRegsFloat :: Int
248 = length $ filter (\r -> regClass r == RcFloat
249 $ map RealReg allocatableRegs