1 {-# LANGUAGE BangPatterns #-}
3 module RegAlloc.Graph.TrivColorable (
9 #include "HsVersions.h"
22 -- trivColorable ---------------------------------------------------------------
24 -- trivColorable function for the graph coloring allocator
26 -- This gets hammered by scanGraph during register allocation,
27 -- so needs to be fairly efficient.
29 -- NOTE: This only works for arcitectures with just RcInteger and RcDouble
30 -- (which are disjoint) ie. x86, x86_64 and ppc
32 -- The number of allocatable regs is hard coded in here so we can do
33 -- a fast comparision in trivColorable.
35 -- It's ok if these numbers are _less_ than the actual number of free
36 -- regs, but they can't be more or the register conflict
39 -- If the graph doesn't color then the allocator will panic, but it won't
40 -- generate bad object code or anything nasty like that.
42 -- There is an allocatableRegsInClass :: RegClass -> Int, but doing
43 -- the unboxing is too slow for us here.
44 -- TODO: Is that still true? Could we use allocatableRegsInClass
45 -- without losing performance now?
47 -- Look at includes/stg/MachRegs.h to get the numbers.
51 -- Disjoint registers ----------------------------------------------------------
53 -- The definition has been unfolded into individual cases for speed.
54 -- Each architecture has a different register setup, so we use a
55 -- different regSqueeze function for each.
64 accSqueeze count maxCount squeeze ufm = acc count (eltsUFM ufm)
65 where acc count [] = count
66 acc count _ | count >=# maxCount = count
67 acc count (r:rs) = acc (count +# squeeze r) rs
72 Doing a nice fold over the UniqSet makes trivColorable use
73 32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs.
74 Therefore the UniqFM is made non-abstract and we use custom fold.
77 When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
78 representation any more. But it is imperative that the assSqueeze stops
79 the folding if the count gets greater or equal to maxCount. We thus convert
80 UniqFM to a (lazy) list, do the fold and stops if necessary, which was
81 the most efficient variant tried. Benchmark compiling 10-times SHA1.lhs follows.
82 (original = previous implementation, folding = fold of the whole UFM,
83 lazyFold = the current implementation,
84 hackFold = using internal representation of Data.IntMap)
86 original folding hackFold lazyFold
87 -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s
88 100.00% 96.44% 97.72% 97.12%
89 -fregs-graph 67.938s 74.875s 62.673s 64.679s
90 100.00% 110.21% 92.25% 95.20%
91 -fregs-iterative 89.761s 143.913s 81.075s 86.912s
92 100.00% 160.33% 90.32% 96.83%
93 -fnew-codegen 38.225s 37.142s 37.551s 37.119s
94 100.00% 97.17% 98.24% 97.11%
95 -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s
96 100.00% 99.70% 95.19% 94.65%
97 -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s
98 100.00% 166.23% 94.18% 100.95%
101 -- TODO: We shouldn't be using defaultTargetPlatform here.
102 -- We should be passing DynFlags in instead, and looking at
103 -- its targetPlatform.
106 :: (RegClass -> VirtualReg -> FastInt)
107 -> (RegClass -> RealReg -> FastInt)
108 -> Triv VirtualReg RegClass RealReg
110 trivColorable virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
111 | let !cALLOCATABLE_REGS_INTEGER
112 = iUnbox (case platformArch defaultTargetPlatform of
117 ArchPPC_64 -> panic "trivColorable ArchPPC_64"
118 ArchUnknown -> panic "trivColorable ArchUnknown")
119 , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_INTEGER
120 (virtualRegSqueeze RcInteger)
123 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER
124 (realRegSqueeze RcInteger)
127 = count3 <# cALLOCATABLE_REGS_INTEGER
129 trivColorable virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
130 | let !cALLOCATABLE_REGS_FLOAT
131 = iUnbox (case platformArch defaultTargetPlatform of
136 ArchPPC_64 -> panic "trivColorable ArchPPC_64"
137 ArchUnknown -> panic "trivColorable ArchUnknown")
138 , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_FLOAT
139 (virtualRegSqueeze RcFloat)
142 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT
143 (realRegSqueeze RcFloat)
146 = count3 <# cALLOCATABLE_REGS_FLOAT
148 trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
149 | let !cALLOCATABLE_REGS_DOUBLE
150 = iUnbox (case platformArch defaultTargetPlatform of
155 ArchPPC_64 -> panic "trivColorable ArchPPC_64"
156 ArchUnknown -> panic "trivColorable ArchUnknown")
157 , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_DOUBLE
158 (virtualRegSqueeze RcDouble)
161 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE
162 (realRegSqueeze RcDouble)
165 = count3 <# cALLOCATABLE_REGS_DOUBLE
167 trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
168 | let !cALLOCATABLE_REGS_SSE
169 = iUnbox (case platformArch defaultTargetPlatform of
174 ArchPPC_64 -> panic "trivColorable ArchPPC_64"
175 ArchUnknown -> panic "trivColorable ArchUnknown")
176 , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_SSE
177 (virtualRegSqueeze RcDoubleSSE)
180 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_SSE
181 (realRegSqueeze RcDoubleSSE)
184 = count3 <# cALLOCATABLE_REGS_SSE
187 -- Specification Code ----------------------------------------------------------
189 -- The trivColorable function for each particular architecture should
190 -- implement the following function, but faster.
194 trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
195 trivColorable classN conflicts exclusions
198 acc :: Reg -> (Int, Int) -> (Int, Int)
201 RcInteger -> (cd+1, cf)
202 RcFloat -> (cd, cf+1)
203 _ -> panic "Regs.trivColorable: reg class not handled"
205 tmp = foldUniqSet acc (0, 0) conflicts
206 (countInt, countFloat) = foldUniqSet acc tmp exclusions
208 squeese = worst countInt classN RcInteger
209 + worst countFloat classN RcFloat
211 in squeese < allocatableRegsInClass classN
213 -- | Worst case displacement
214 -- node N of classN has n neighbors of class C.
216 -- We currently only have RcInteger and RcDouble, which don't conflict at all.
217 -- This is a bit boring compared to what's in RegArchX86.
219 worst :: Int -> RegClass -> RegClass -> Int
220 worst n classN classC
224 RcInteger -> min n (allocatableRegsInClass RcInteger)
229 RcFloat -> min n (allocatableRegsInClass RcFloat)
232 -- allocatableRegs is allMachRegNos with the fixed-use regs removed.
233 -- i.e., these are the regs for which we are prepared to allow the
234 -- register allocator to attempt to map VRegs to.
235 allocatableRegs :: [RegNo]
237 = let isFree i = isFastTrue (freeReg i)
238 in filter isFree allMachRegNos
241 -- | The number of regs in each class.
242 -- We go via top level CAFs to ensure that we're not recomputing
243 -- the length of these lists each time the fn is called.
244 allocatableRegsInClass :: RegClass -> Int
245 allocatableRegsInClass cls
247 RcInteger -> allocatableRegsInteger
248 RcFloat -> allocatableRegsDouble
250 allocatableRegsInteger :: Int
251 allocatableRegsInteger
252 = length $ filter (\r -> regClass r == RcInteger)
253 $ map RealReg allocatableRegs
255 allocatableRegsFloat :: Int
257 = length $ filter (\r -> regClass r == RcFloat
258 $ map RealReg allocatableRegs