1 Character classification
5 ( is_ident -- Char# -> Bool
6 , is_symbol -- Char# -> Bool
7 , is_any -- Char# -> Bool
8 , is_space -- Char# -> Bool
9 , is_lower -- Char# -> Bool
10 , is_upper -- Char# -> Bool
11 , is_digit -- Char# -> Bool
14 #include "HsVersions.h"
16 import DATA_INT ( Int32 )
17 import DATA_BITS ( Bits((.&.)) )
18 import GLAEXTS ( Char#, Char(..) )
24 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
34 The predicates below look costly, but they aren't: GHC+GCC do a great job
35 of compiling the big case expression below.
-- | Test whether a character carries any of the class flags in @mask@.
--
-- The character is boxed with 'C#' and looked up with 'charType'; both the
-- lookup result and @mask@ are converted to 'Int32', and the predicate
-- holds when their bitwise AND is non-zero.
{-# INLINE is_ctype #-}
is_ctype :: Int -> Char# -> Bool
is_ctype mask c = (classBits .&. wanted) /= 0
  where
    classBits, wanted :: Int32
    classBits = fromIntegral (charType (C# c))  -- flags recorded for this character
    wanted    = fromIntegral mask               -- flags the caller asks about
-- Class predicates on unboxed characters.  Each one is 'is_ctype' applied
-- to a single class flag, so it holds exactly when 'charType' includes
-- that flag for the character.

-- | Holds when 'charType' flags the character with 'cIdent'.
is_ident :: Char# -> Bool
is_ident c = is_ctype cIdent c

-- | Holds when 'charType' flags the character with 'cSymbol'.
is_symbol :: Char# -> Bool
is_symbol c = is_ctype cSymbol c

-- | Holds when 'charType' flags the character with 'cAny'.
is_any :: Char# -> Bool
is_any c = is_ctype cAny c

-- | Holds when 'charType' flags the character with 'cSpace'.
is_space :: Char# -> Bool
is_space c = is_ctype cSpace c

-- | Holds when 'charType' flags the character with 'cLower'.
is_lower :: Char# -> Bool
is_lower c = is_ctype cLower c

-- | Holds when 'charType' flags the character with 'cUpper'.
is_upper :: Char# -> Bool
is_upper c = is_ctype cUpper c

-- | Holds when 'charType' flags the character with 'cDigit'.
is_digit :: Char# -> Bool
is_digit c = is_ctype cDigit c
52 We really mean .|. instead of + below, but GHC currently doesn't do
53 any constant folding with bitops. *sigh*
-- | Look up the class flags of a character.
--
-- The result is the sum of the @c*@ flags that apply to the character;
-- 'is_ctype' tests individual flags against it.  The flags are combined
-- with '+' rather than '.|.' because GHC does not constant-fold the bit
-- operations.
--
-- Conventions visible in the table:
--
--   * every listed character carries 'cAny';
--
--   * letters and digits carry 'cIdent', as do the prime (@'@) and the
--     underscore, and the underscore additionally counts as lower case;
--
--   * operator characters carry 'cSymbol' only, never a case flag.
--
-- NOTE(review): there is no catch-all alternative, so code points that are
-- not listed below are not matched by this case expression.
charType :: Char -> Int
charType c = case c of
  -- White space.
  '\9'   -> cAny + cSpace             -- \t
  '\10'  -> cAny + cSpace             -- \n
  '\11'  -> cAny + cSpace             -- \v
  '\12'  -> cAny + cSpace             -- \f
  '\13'  -> cAny + cSpace             -- ^M
  '\32'  -> cAny + cSpace             -- space

  -- ASCII punctuation and digits.
  '\33'  -> cAny + cSymbol            -- !
  '\35'  -> cAny + cSymbol            -- #
  '\36'  -> cAny + cSymbol            -- $
  '\37'  -> cAny + cSymbol            -- %
  '\38'  -> cAny + cSymbol            -- &
  '\39'  -> cAny + cIdent             -- '
  '\42'  -> cAny + cSymbol            -- *
  '\43'  -> cAny + cSymbol            -- +
  '\45'  -> cAny + cSymbol            -- -
  '\46'  -> cAny + cSymbol            -- .
  '\47'  -> cAny + cSymbol            -- /
  '\48'  -> cAny + cIdent + cDigit    -- 0
  '\49'  -> cAny + cIdent + cDigit    -- 1
  '\50'  -> cAny + cIdent + cDigit    -- 2
  '\51'  -> cAny + cIdent + cDigit    -- 3
  '\52'  -> cAny + cIdent + cDigit    -- 4
  '\53'  -> cAny + cIdent + cDigit    -- 5
  '\54'  -> cAny + cIdent + cDigit    -- 6
  '\55'  -> cAny + cIdent + cDigit    -- 7
  '\56'  -> cAny + cIdent + cDigit    -- 8
  '\57'  -> cAny + cIdent + cDigit    -- 9
  '\58'  -> cAny + cSymbol            -- :
  '\60'  -> cAny + cSymbol            -- <
  '\61'  -> cAny + cSymbol            -- =
  '\62'  -> cAny + cSymbol            -- >
  '\63'  -> cAny + cSymbol            -- ?
  '\64'  -> cAny + cSymbol            -- @

  -- ASCII capital letters.
  '\65'  -> cAny + cIdent + cUpper    -- A
  '\66'  -> cAny + cIdent + cUpper    -- B
  '\67'  -> cAny + cIdent + cUpper    -- C
  '\68'  -> cAny + cIdent + cUpper    -- D
  '\69'  -> cAny + cIdent + cUpper    -- E
  '\70'  -> cAny + cIdent + cUpper    -- F
  '\71'  -> cAny + cIdent + cUpper    -- G
  '\72'  -> cAny + cIdent + cUpper    -- H
  '\73'  -> cAny + cIdent + cUpper    -- I
  '\74'  -> cAny + cIdent + cUpper    -- J
  '\75'  -> cAny + cIdent + cUpper    -- K
  '\76'  -> cAny + cIdent + cUpper    -- L
  '\77'  -> cAny + cIdent + cUpper    -- M
  '\78'  -> cAny + cIdent + cUpper    -- N
  '\79'  -> cAny + cIdent + cUpper    -- O
  '\80'  -> cAny + cIdent + cUpper    -- P
  '\81'  -> cAny + cIdent + cUpper    -- Q
  '\82'  -> cAny + cIdent + cUpper    -- R
  '\83'  -> cAny + cIdent + cUpper    -- S
  '\84'  -> cAny + cIdent + cUpper    -- T
  '\85'  -> cAny + cIdent + cUpper    -- U
  '\86'  -> cAny + cIdent + cUpper    -- V
  '\87'  -> cAny + cIdent + cUpper    -- W
  '\88'  -> cAny + cIdent + cUpper    -- X
  '\89'  -> cAny + cIdent + cUpper    -- Y
  '\90'  -> cAny + cIdent + cUpper    -- Z
  '\92'  -> cAny + cSymbol            -- backslash
  '\94'  -> cAny + cSymbol            -- ^

  -- Underscore and ASCII small letters.
  '\95'  -> cAny + cIdent + cLower    -- _
  '\97'  -> cAny + cIdent + cLower    -- a
  '\98'  -> cAny + cIdent + cLower    -- b
  '\99'  -> cAny + cIdent + cLower    -- c
  '\100' -> cAny + cIdent + cLower    -- d
  '\101' -> cAny + cIdent + cLower    -- e
  '\102' -> cAny + cIdent + cLower    -- f
  '\103' -> cAny + cIdent + cLower    -- g
  '\104' -> cAny + cIdent + cLower    -- h
  '\105' -> cAny + cIdent + cLower    -- i
  '\106' -> cAny + cIdent + cLower    -- j
  '\107' -> cAny + cIdent + cLower    -- k
  '\108' -> cAny + cIdent + cLower    -- l
  '\109' -> cAny + cIdent + cLower    -- m
  '\110' -> cAny + cIdent + cLower    -- n
  '\111' -> cAny + cIdent + cLower    -- o
  '\112' -> cAny + cIdent + cLower    -- p
  '\113' -> cAny + cIdent + cLower    -- q
  '\114' -> cAny + cIdent + cLower    -- r
  '\115' -> cAny + cIdent + cLower    -- s
  '\116' -> cAny + cIdent + cLower    -- t
  '\117' -> cAny + cIdent + cLower    -- u
  '\118' -> cAny + cIdent + cLower    -- v
  '\119' -> cAny + cIdent + cLower    -- w
  '\120' -> cAny + cIdent + cLower    -- x
  '\121' -> cAny + cIdent + cLower    -- y
  '\122' -> cAny + cIdent + cLower    -- z
  '\124' -> cAny + cSymbol            -- |
  '\126' -> cAny + cSymbol            -- ~

  -- Latin-1 punctuation and signs.
  '\161' -> cAny + cSymbol            -- ¡
  '\162' -> cAny + cSymbol            -- ¢
  '\163' -> cAny + cSymbol            -- £
  '\164' -> cAny + cSymbol            -- ¤
  '\165' -> cAny + cSymbol            -- ¥
  '\166' -> cAny + cSymbol            -- ¦
  '\167' -> cAny + cSymbol            -- §
  '\168' -> cAny + cSymbol            -- ¨
  '\169' -> cAny + cSymbol            -- ©
  '\170' -> cAny + cSymbol            -- ª
  '\171' -> cAny + cSymbol            -- «
  '\172' -> cAny + cSymbol            -- ¬
  '\173' -> cAny + cSymbol            -- soft hyphen
  '\174' -> cAny + cSymbol            -- ®
  '\175' -> cAny + cSymbol            -- ¯
  '\176' -> cAny + cSymbol            -- °
  '\177' -> cAny + cSymbol            -- ±
  '\178' -> cAny + cSymbol            -- ²
  '\179' -> cAny + cSymbol            -- ³
  '\180' -> cAny + cSymbol            -- ´
  '\181' -> cAny + cSymbol            -- µ
  '\182' -> cAny + cSymbol            -- ¶
  '\183' -> cAny + cSymbol            -- ·
  '\184' -> cAny + cSymbol            -- ¸
  '\185' -> cAny + cSymbol            -- ¹
  '\186' -> cAny + cSymbol            -- º
  '\187' -> cAny + cSymbol            -- »
  '\188' -> cAny + cSymbol            -- ¼
  '\189' -> cAny + cSymbol            -- ½
  '\190' -> cAny + cSymbol            -- ¾
  '\191' -> cAny + cSymbol            -- ¿

  -- Latin-1 capital letters.
  '\192' -> cAny + cIdent + cUpper    -- À
  '\193' -> cAny + cIdent + cUpper    -- Á
  '\194' -> cAny + cIdent + cUpper    -- Â
  '\195' -> cAny + cIdent + cUpper    -- Ã
  '\196' -> cAny + cIdent + cUpper    -- Ä
  '\197' -> cAny + cIdent + cUpper    -- Å
  '\198' -> cAny + cIdent + cUpper    -- Æ
  '\199' -> cAny + cIdent + cUpper    -- Ç
  '\200' -> cAny + cIdent + cUpper    -- È
  '\201' -> cAny + cIdent + cUpper    -- É
  '\202' -> cAny + cIdent + cUpper    -- Ê
  '\203' -> cAny + cIdent + cUpper    -- Ë
  '\204' -> cAny + cIdent + cUpper    -- Ì
  '\205' -> cAny + cIdent + cUpper    -- Í
  '\206' -> cAny + cIdent + cUpper    -- Î
  '\207' -> cAny + cIdent + cUpper    -- Ï
  '\208' -> cAny + cIdent + cUpper    -- Ð
  '\209' -> cAny + cIdent + cUpper    -- Ñ
  '\210' -> cAny + cIdent + cUpper    -- Ò
  '\211' -> cAny + cIdent + cUpper    -- Ó
  '\212' -> cAny + cIdent + cUpper    -- Ô
  '\213' -> cAny + cIdent + cUpper    -- Õ
  '\214' -> cAny + cIdent + cUpper    -- Ö
  -- The multiplication sign is an operator character like '÷' ('\247'),
  -- so it gets the symbol flag only and no case flag.
  '\215' -> cAny + cSymbol            -- ×
  '\216' -> cAny + cIdent + cUpper    -- Ø
  '\217' -> cAny + cIdent + cUpper    -- Ù
  '\218' -> cAny + cIdent + cUpper    -- Ú
  '\219' -> cAny + cIdent + cUpper    -- Û
  '\220' -> cAny + cIdent + cUpper    -- Ü
  '\221' -> cAny + cIdent + cUpper    -- Ý
  '\222' -> cAny + cIdent + cUpper    -- Þ

  -- Latin-1 small letters.
  -- NOTE(review): 'ß' carries neither case flag; it has no capital form
  -- in this table, so this is left as it was -- confirm whether it should
  -- count as lower case.
  '\223' -> cAny + cIdent             -- ß
  '\224' -> cAny + cIdent + cLower    -- à
  '\225' -> cAny + cIdent + cLower    -- á
  '\226' -> cAny + cIdent + cLower    -- â
  '\227' -> cAny + cIdent + cLower    -- ã
  '\228' -> cAny + cIdent + cLower    -- ä
  '\229' -> cAny + cIdent + cLower    -- å
  '\230' -> cAny + cIdent + cLower    -- æ
  '\231' -> cAny + cIdent + cLower    -- ç
  '\232' -> cAny + cIdent + cLower    -- è
  '\233' -> cAny + cIdent + cLower    -- é
  '\234' -> cAny + cIdent + cLower    -- ê
  '\235' -> cAny + cIdent + cLower    -- ë
  '\236' -> cAny + cIdent + cLower    -- ì
  '\237' -> cAny + cIdent + cLower    -- í
  '\238' -> cAny + cIdent + cLower    -- î
  '\239' -> cAny + cIdent + cLower    -- ï
  '\240' -> cAny + cIdent + cLower    -- ð
  '\241' -> cAny + cIdent + cLower    -- ñ
  '\242' -> cAny + cIdent + cLower    -- ò
  '\243' -> cAny + cIdent + cLower    -- ó
  '\244' -> cAny + cIdent + cLower    -- ô
  '\245' -> cAny + cIdent + cLower    -- õ
  '\246' -> cAny + cIdent + cLower    -- ö
  '\247' -> cAny + cSymbol            -- ÷
  -- 'ø' is the small form of 'Ø' ('\216', flagged upper case), so it is
  -- lower case like the rest of this range.
  '\248' -> cAny + cIdent + cLower    -- ø
  '\249' -> cAny + cIdent + cLower    -- ù
  '\250' -> cAny + cIdent + cLower    -- ú
  '\251' -> cAny + cIdent + cLower    -- û
  '\252' -> cAny + cIdent + cLower    -- ü
  '\253' -> cAny + cIdent + cLower    -- ý
  '\254' -> cAny + cIdent + cLower    -- þ
  '\255' -> cAny + cIdent + cLower    -- ÿ