1 Character classification
5 ( is_ident -- Char# -> Bool
6 , is_symbol -- Char# -> Bool
7 , is_any -- Char# -> Bool
8 , is_space -- Char# -> Bool
9 , is_lower -- Char# -> Bool
10 , is_upper -- Char# -> Bool
11 , is_digit -- Char# -> Bool
12 , is_alphanum -- Char# -> Bool
14 , is_hexdigit, is_octdigit
15 , hexDigit, octDecDigit
18 #include "HsVersions.h"
20 import Data.Int ( Int32 )
21 import Data.Bits ( Bits((.&.)) )
22 import Data.Char ( ord, chr )
29 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
The predicates below look costly, but they aren't: GHC and GCC together do
a great job of compiling the big case expression below.
{-# INLINE is_ctype #-}
-- | Test a character against a class mask.
--
-- Returns 'True' when at least one bit of @mask@ is also set in the class
-- word that 'charType' yields for @c@.  Both operands are converted to
-- 'Int32' before the bitwise test.
is_ctype :: Int -> Char -> Bool
is_ctype mask c = classBits .&. wanted /= 0
  where
    classBits, wanted :: Int32
    classBits = fromIntegral (charType c)
    wanted    = fromIntegral mask
-- | 'True' when the 'cIdent' bit is set for the character.
is_ident :: Char -> Bool
is_ident ch = is_ctype cIdent ch

-- | 'True' when the 'cSymbol' bit is set for the character.
is_symbol :: Char -> Bool
is_symbol ch = is_ctype cSymbol ch

-- | 'True' when the 'cAny' bit is set for the character.
is_any :: Char -> Bool
is_any ch = is_ctype cAny ch

-- | 'True' when the 'cSpace' bit is set for the character.
is_space :: Char -> Bool
is_space ch = is_ctype cSpace ch

-- | 'True' when the 'cLower' bit is set for the character.
is_lower :: Char -> Bool
is_lower ch = is_ctype cLower ch

-- | 'True' when the 'cUpper' bit is set for the character.
is_upper :: Char -> Bool
is_upper ch = is_ctype cUpper ch

-- | 'True' when the 'cDigit' bit is set for the character.
is_digit :: Char -> Bool
is_digit ch = is_ctype cDigit ch

-- | 'True' when any of the 'cLower', 'cUpper' or 'cDigit' bits is set.
-- The three masks are combined with '+', as elsewhere in this module.
is_alphanum :: Char -> Bool
is_alphanum ch = is_ctype alnumMask ch
  where
    alnumMask = cLower + cUpper + cDigit
-- | Numeric value of a hexadecimal digit character.
--
-- Characters accepted by 'is_digit' map to their offset from @'0'@; every
-- other character is passed through 'to_lower' and mapped to
-- @10 + offset from 'a'@.  The input is not validated here.
hexDigit :: Char -> Int
hexDigit c =
  if is_digit c
    then ord c - ord '0'
    else 10 + (ord (to_lower c) - ord 'a')
-- | Numeric value of an octal or decimal digit character, computed as the
-- character's code point minus that of @'0'@.  The input is not validated.
octDecDigit :: Char -> Int
octDecDigit = subtract (ord '0') . ord
69 is_hexdigit :: Char -> Bool
72 || (c >= 'a' && c <= 'f')
73 || (c >= 'A' && c <= 'F')
-- | 'True' exactly for the characters @'0'@ through @'7'@.
is_octdigit :: Char -> Bool
is_octdigit c = not (c < '0' || c > '7')
78 to_lower :: Char -> Char
80 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
84 We really mean .|. instead of + below, but GHC currently doesn't do
85 any constant folding with bitops. *sigh*
-- | Class word for a character: the sum of the @c*@ flags that apply to it.
--
-- The flags are combined with '+' rather than a bitwise or; this is only
-- equivalent if the @c*@ constants are distinct single-bit masks
-- (NOTE(review): their definitions are not visible here -- confirm).
--
-- Any character without an explicit alternative reaches the final wildcard
-- and panics.
--
-- Within the Latin-1 range, the multiplication sign (@'\\215'@) is classed
-- as a plain symbol, matching the division sign (@'\\247'@), and the
-- lowercase letters @'\\223'@ and @'\\248'@ carry 'cLower' like the other
-- lowercase letters around them.
charType :: Char -> Int
charType c = case c of
  '\9'   -> cSpace                    -- \t  (not allowed in strings, so !cAny)
  '\10'  -> cSpace                    -- \n  (ditto)
  '\11'  -> cSpace                    -- \v  (ditto)
  '\12'  -> cSpace                    -- \f  (ditto)
  '\13'  -> cSpace                    -- ^M  (ditto)
  '\32'  -> cAny + cSpace             -- space
  '\33'  -> cAny + cSymbol            -- !
  '\35'  -> cAny + cSymbol            -- #
  '\36'  -> cAny + cSymbol            -- $
  '\37'  -> cAny + cSymbol            -- %
  '\38'  -> cAny + cSymbol            -- &
  '\39'  -> cAny + cIdent             -- '
  '\42'  -> cAny + cSymbol            -- *
  '\43'  -> cAny + cSymbol            -- +
  '\45'  -> cAny + cSymbol            -- -
  '\46'  -> cAny + cSymbol            -- .
  '\47'  -> cAny + cSymbol            -- /
  '\48'  -> cAny + cIdent + cDigit    -- 0
  '\49'  -> cAny + cIdent + cDigit    -- 1
  '\50'  -> cAny + cIdent + cDigit    -- 2
  '\51'  -> cAny + cIdent + cDigit    -- 3
  '\52'  -> cAny + cIdent + cDigit    -- 4
  '\53'  -> cAny + cIdent + cDigit    -- 5
  '\54'  -> cAny + cIdent + cDigit    -- 6
  '\55'  -> cAny + cIdent + cDigit    -- 7
  '\56'  -> cAny + cIdent + cDigit    -- 8
  '\57'  -> cAny + cIdent + cDigit    -- 9
  '\58'  -> cAny + cSymbol            -- :
  '\60'  -> cAny + cSymbol            -- <
  '\61'  -> cAny + cSymbol            -- =
  '\62'  -> cAny + cSymbol            -- >
  '\63'  -> cAny + cSymbol            -- ?
  '\64'  -> cAny + cSymbol            -- @
  '\65'  -> cAny + cIdent + cUpper    -- A
  '\66'  -> cAny + cIdent + cUpper    -- B
  '\67'  -> cAny + cIdent + cUpper    -- C
  '\68'  -> cAny + cIdent + cUpper    -- D
  '\69'  -> cAny + cIdent + cUpper    -- E
  '\70'  -> cAny + cIdent + cUpper    -- F
  '\71'  -> cAny + cIdent + cUpper    -- G
  '\72'  -> cAny + cIdent + cUpper    -- H
  '\73'  -> cAny + cIdent + cUpper    -- I
  '\74'  -> cAny + cIdent + cUpper    -- J
  '\75'  -> cAny + cIdent + cUpper    -- K
  '\76'  -> cAny + cIdent + cUpper    -- L
  '\77'  -> cAny + cIdent + cUpper    -- M
  '\78'  -> cAny + cIdent + cUpper    -- N
  '\79'  -> cAny + cIdent + cUpper    -- O
  '\80'  -> cAny + cIdent + cUpper    -- P
  '\81'  -> cAny + cIdent + cUpper    -- Q
  '\82'  -> cAny + cIdent + cUpper    -- R
  '\83'  -> cAny + cIdent + cUpper    -- S
  '\84'  -> cAny + cIdent + cUpper    -- T
  '\85'  -> cAny + cIdent + cUpper    -- U
  '\86'  -> cAny + cIdent + cUpper    -- V
  '\87'  -> cAny + cIdent + cUpper    -- W
  '\88'  -> cAny + cIdent + cUpper    -- X
  '\89'  -> cAny + cIdent + cUpper    -- Y
  '\90'  -> cAny + cIdent + cUpper    -- Z
  '\92'  -> cAny + cSymbol            -- backslash
  '\94'  -> cAny + cSymbol            -- ^
  '\95'  -> cAny + cIdent + cLower    -- _
  '\97'  -> cAny + cIdent + cLower    -- a
  '\98'  -> cAny + cIdent + cLower    -- b
  '\99'  -> cAny + cIdent + cLower    -- c
  '\100' -> cAny + cIdent + cLower    -- d
  '\101' -> cAny + cIdent + cLower    -- e
  '\102' -> cAny + cIdent + cLower    -- f
  '\103' -> cAny + cIdent + cLower    -- g
  '\104' -> cAny + cIdent + cLower    -- h
  '\105' -> cAny + cIdent + cLower    -- i
  '\106' -> cAny + cIdent + cLower    -- j
  '\107' -> cAny + cIdent + cLower    -- k
  '\108' -> cAny + cIdent + cLower    -- l
  '\109' -> cAny + cIdent + cLower    -- m
  '\110' -> cAny + cIdent + cLower    -- n
  '\111' -> cAny + cIdent + cLower    -- o
  '\112' -> cAny + cIdent + cLower    -- p
  '\113' -> cAny + cIdent + cLower    -- q
  '\114' -> cAny + cIdent + cLower    -- r
  '\115' -> cAny + cIdent + cLower    -- s
  '\116' -> cAny + cIdent + cLower    -- t
  '\117' -> cAny + cIdent + cLower    -- u
  '\118' -> cAny + cIdent + cLower    -- v
  '\119' -> cAny + cIdent + cLower    -- w
  '\120' -> cAny + cIdent + cLower    -- x
  '\121' -> cAny + cIdent + cLower    -- y
  '\122' -> cAny + cIdent + cLower    -- z
  '\124' -> cAny + cSymbol            -- |
  '\126' -> cAny + cSymbol            -- ~
  '\161' -> cAny + cSymbol            -- ¡
  '\162' -> cAny + cSymbol            -- ¢
  '\163' -> cAny + cSymbol            -- £
  '\164' -> cAny + cSymbol            -- ¤
  '\165' -> cAny + cSymbol            -- ¥
  '\166' -> cAny + cSymbol            -- ¦
  '\167' -> cAny + cSymbol            -- §
  '\168' -> cAny + cSymbol            -- ¨
  '\169' -> cAny + cSymbol            -- ©
  '\170' -> cAny + cSymbol            -- ª
  '\171' -> cAny + cSymbol            -- «
  '\172' -> cAny + cSymbol            -- ¬
  '\173' -> cAny + cSymbol            -- soft hyphen
  '\174' -> cAny + cSymbol            -- ®
  '\175' -> cAny + cSymbol            -- ¯
  '\176' -> cAny + cSymbol            -- °
  '\177' -> cAny + cSymbol            -- ±
  '\178' -> cAny + cSymbol            -- ²
  '\179' -> cAny + cSymbol            -- ³
  '\180' -> cAny + cSymbol            -- ´
  '\181' -> cAny + cSymbol            -- µ
  '\182' -> cAny + cSymbol            -- ¶
  '\183' -> cAny + cSymbol            -- ·
  '\184' -> cAny + cSymbol            -- ¸
  '\185' -> cAny + cSymbol            -- ¹
  '\186' -> cAny + cSymbol            -- º
  '\187' -> cAny + cSymbol            -- »
  '\188' -> cAny + cSymbol            -- ¼
  '\189' -> cAny + cSymbol            -- ½
  '\190' -> cAny + cSymbol            -- ¾
  '\191' -> cAny + cSymbol            -- ¿
  '\192' -> cAny + cIdent + cUpper    -- À
  '\193' -> cAny + cIdent + cUpper    -- Á
  '\194' -> cAny + cIdent + cUpper    -- Â
  '\195' -> cAny + cIdent + cUpper    -- Ã
  '\196' -> cAny + cIdent + cUpper    -- Ä
  '\197' -> cAny + cIdent + cUpper    -- Å
  '\198' -> cAny + cIdent + cUpper    -- Æ
  '\199' -> cAny + cIdent + cUpper    -- Ç
  '\200' -> cAny + cIdent + cUpper    -- È
  '\201' -> cAny + cIdent + cUpper    -- É
  '\202' -> cAny + cIdent + cUpper    -- Ê
  '\203' -> cAny + cIdent + cUpper    -- Ë
  '\204' -> cAny + cIdent + cUpper    -- Ì
  '\205' -> cAny + cIdent + cUpper    -- Í
  '\206' -> cAny + cIdent + cUpper    -- Î
  '\207' -> cAny + cIdent + cUpper    -- Ï
  '\208' -> cAny + cIdent + cUpper    -- Ð
  '\209' -> cAny + cIdent + cUpper    -- Ñ
  '\210' -> cAny + cIdent + cUpper    -- Ò
  '\211' -> cAny + cIdent + cUpper    -- Ó
  '\212' -> cAny + cIdent + cUpper    -- Ô
  '\213' -> cAny + cIdent + cUpper    -- Õ
  '\214' -> cAny + cIdent + cUpper    -- Ö
  '\215' -> cAny + cSymbol            -- ×  (a symbol, not a lowercase letter)
  '\216' -> cAny + cIdent + cUpper    -- Ø
  '\217' -> cAny + cIdent + cUpper    -- Ù
  '\218' -> cAny + cIdent + cUpper    -- Ú
  '\219' -> cAny + cIdent + cUpper    -- Û
  '\220' -> cAny + cIdent + cUpper    -- Ü
  '\221' -> cAny + cIdent + cUpper    -- Ý
  '\222' -> cAny + cIdent + cUpper    -- Þ
  '\223' -> cAny + cIdent + cLower    -- ß
  '\224' -> cAny + cIdent + cLower    -- à
  '\225' -> cAny + cIdent + cLower    -- á
  '\226' -> cAny + cIdent + cLower    -- â
  '\227' -> cAny + cIdent + cLower    -- ã
  '\228' -> cAny + cIdent + cLower    -- ä
  '\229' -> cAny + cIdent + cLower    -- å
  '\230' -> cAny + cIdent + cLower    -- æ
  '\231' -> cAny + cIdent + cLower    -- ç
  '\232' -> cAny + cIdent + cLower    -- è
  '\233' -> cAny + cIdent + cLower    -- é
  '\234' -> cAny + cIdent + cLower    -- ê
  '\235' -> cAny + cIdent + cLower    -- ë
  '\236' -> cAny + cIdent + cLower    -- ì
  '\237' -> cAny + cIdent + cLower    -- í
  '\238' -> cAny + cIdent + cLower    -- î
  '\239' -> cAny + cIdent + cLower    -- ï
  '\240' -> cAny + cIdent + cLower    -- ð
  '\241' -> cAny + cIdent + cLower    -- ñ
  '\242' -> cAny + cIdent + cLower    -- ò
  '\243' -> cAny + cIdent + cLower    -- ó
  '\244' -> cAny + cIdent + cLower    -- ô
  '\245' -> cAny + cIdent + cLower    -- õ
  '\246' -> cAny + cIdent + cLower    -- ö
  '\247' -> cAny + cSymbol            -- ÷
  '\248' -> cAny + cIdent + cLower    -- ø
  '\249' -> cAny + cIdent + cLower    -- ù
  '\250' -> cAny + cIdent + cLower    -- ú
  '\251' -> cAny + cIdent + cLower    -- û
  '\252' -> cAny + cIdent + cLower    -- ü
  '\253' -> cAny + cIdent + cLower    -- ý
  '\254' -> cAny + cIdent + cLower    -- þ
  '\255' -> cAny + cIdent + cLower    -- ÿ
  _      -> panic ("charType: " ++ show c)