1 Character classification
5 ( is_ident -- Char -> Bool
6 , is_symbol -- Char -> Bool
7 , is_any -- Char -> Bool
8 , is_space -- Char -> Bool
9 , is_lower -- Char -> Bool
10 , is_upper -- Char -> Bool
11 , is_digit -- Char -> Bool
12 , is_alphanum -- Char -> Bool
14 , is_decdigit, is_hexdigit, is_octdigit
15 , hexDigit, octDecDigit
18 #include "HsVersions.h"
20 import Data.Int ( Int32 )
21 import Data.Bits ( Bits((.&.)) )
22 import Data.Char ( ord, chr )
29 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
39 The predicates below look costly but aren't: GHC+GCC do a great job
40 on the big case below.
-- | Test whether a character belongs to at least one of the classes
-- selected by @mask@.
--
-- Both the mask and the result of 'charType' are converted to 'Int32'
-- before being ANDed together; the character matches when the result is
-- non-zero.
{-# INLINE is_ctype #-}
is_ctype :: Int -> Char -> Bool
is_ctype mask c = (classBits .&. wanted) /= 0
  where
    classBits, wanted :: Int32
    classBits = fromIntegral (charType c)
    wanted    = fromIntegral mask
-- | Character-class predicates.  Each one asks 'is_ctype' whether the
-- character carries the corresponding class flag; 'is_alphanum' accepts
-- any of the lower-case, upper-case or digit flags.
is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
  is_alphanum :: Char -> Bool
is_ident    c = is_ctype cIdent c
is_symbol   c = is_ctype cSymbol c
is_any      c = is_ctype cAny c
is_space    c = is_ctype cSpace c
is_lower    c = is_ctype cLower c
is_upper    c = is_ctype cUpper c
is_digit    c = is_ctype cDigit c
is_alphanum c = is_ctype (cLower + cUpper + cDigit) c
-- | Numeric value of a hexadecimal digit character.
--
-- Characters accepted by 'is_decdigit' map to their distance from @\'0\'@;
-- anything else is passed through 'to_lower' and measured from @\'a\'@,
-- plus 10.  The argument is not validated, so a character that is not a
-- hex digit yields a meaningless number rather than an error.
hexDigit :: Char -> Int
hexDigit c =
  if is_decdigit c
    then ord c - ord '0'
    else ord (to_lower c) - ord 'a' + 10
-- | Numeric value of an octal or decimal digit character: the distance
-- of its code point from that of @\'0\'@.  The argument is not validated.
octDecDigit :: Char -> Int
octDecDigit = subtract (ord '0') . ord
-- | True exactly for the ASCII decimal digits @\'0\'@ to @\'9\'@.
is_decdigit :: Char -> Bool
is_decdigit c = c >= '0' && c <= '9'
-- | True exactly for the ASCII hexadecimal digits: @\'0\'@ to @\'9\'@,
-- @\'a\'@ to @\'f\'@ and @\'A\'@ to @\'F\'@.
is_hexdigit :: Char -> Bool
is_hexdigit c
  =  (c >= '0' && c <= '9')
  || (c >= 'a' && c <= 'f')
  || (c >= 'A' && c <= 'F')
-- | True exactly for the ASCII octal digits @\'0\'@ to @\'7\'@.
is_octdigit :: Char -> Bool
is_octdigit c = '0' <= c && c <= '7'
-- | Lower-case an ASCII upper-case letter (@\'A\'@ to @\'Z\'@) by shifting
-- it by the distance between @\'A\'@ and @\'a\'@.  Every other character
-- is returned unchanged.
to_lower :: Char -> Char
to_lower c
  | c >= 'A' && c <= 'Z' = chr (ord c + (ord 'a' - ord 'A'))
  | otherwise            = c
88 We really mean .|. instead of + below, but GHC currently doesn't do
89 any constant folding with bitops. *sigh*
-- | Class flags of a character, as a sum of the @c*@ constants.
--
-- The flags are combined with @+@ where @.|.@ is really meant; this
-- presumably relies on the constants being distinct bits (their
-- definitions are not part of this block -- confirm before changing).
-- A character with no alternative below falls through to 'panic'.
--
-- Corrections relative to the earlier table:
--
--   * @\'\\215\'@ (multiplication sign) is a symbol only; it no longer
--     carries 'cLower'.
--
--   * @\'\\223\'@ (sharp s) and @\'\\248\'@ (o with stroke) are lower-case
--     letters and now carry 'cLower' like their neighbours.
charType :: Char -> Int
charType c = case c of
  '\9'   -> cSpace  -- \t  (not allowed in strings, so !cAny)
  '\10'  -> cSpace  -- \n  (ditto)
  '\11'  -> cSpace  -- \v  (ditto)
  '\12'  -> cSpace  -- \f  (ditto)
  '\13'  -> cSpace  -- ^M  (ditto)
  '\32'  -> cAny + cSpace  -- space
  '\33'  -> cAny + cSymbol  -- !
  '\35'  -> cAny + cSymbol  -- #
  '\36'  -> cAny + cSymbol  -- $
  '\37'  -> cAny + cSymbol  -- %
  '\38'  -> cAny + cSymbol  -- &
  '\39'  -> cAny + cIdent  -- '
  '\42'  -> cAny + cSymbol  -- *
  '\43'  -> cAny + cSymbol  -- +
  '\45'  -> cAny + cSymbol  -- -
  '\46'  -> cAny + cSymbol  -- .
  '\47'  -> cAny + cSymbol  -- /
  '\48'  -> cAny + cIdent + cDigit  -- 0
  '\49'  -> cAny + cIdent + cDigit  -- 1
  '\50'  -> cAny + cIdent + cDigit  -- 2
  '\51'  -> cAny + cIdent + cDigit  -- 3
  '\52'  -> cAny + cIdent + cDigit  -- 4
  '\53'  -> cAny + cIdent + cDigit  -- 5
  '\54'  -> cAny + cIdent + cDigit  -- 6
  '\55'  -> cAny + cIdent + cDigit  -- 7
  '\56'  -> cAny + cIdent + cDigit  -- 8
  '\57'  -> cAny + cIdent + cDigit  -- 9
  '\58'  -> cAny + cSymbol  -- :
  '\60'  -> cAny + cSymbol  -- <
  '\61'  -> cAny + cSymbol  -- =
  '\62'  -> cAny + cSymbol  -- >
  '\63'  -> cAny + cSymbol  -- ?
  '\64'  -> cAny + cSymbol  -- @
  '\65'  -> cAny + cIdent + cUpper  -- A
  '\66'  -> cAny + cIdent + cUpper  -- B
  '\67'  -> cAny + cIdent + cUpper  -- C
  '\68'  -> cAny + cIdent + cUpper  -- D
  '\69'  -> cAny + cIdent + cUpper  -- E
  '\70'  -> cAny + cIdent + cUpper  -- F
  '\71'  -> cAny + cIdent + cUpper  -- G
  '\72'  -> cAny + cIdent + cUpper  -- H
  '\73'  -> cAny + cIdent + cUpper  -- I
  '\74'  -> cAny + cIdent + cUpper  -- J
  '\75'  -> cAny + cIdent + cUpper  -- K
  '\76'  -> cAny + cIdent + cUpper  -- L
  '\77'  -> cAny + cIdent + cUpper  -- M
  '\78'  -> cAny + cIdent + cUpper  -- N
  '\79'  -> cAny + cIdent + cUpper  -- O
  '\80'  -> cAny + cIdent + cUpper  -- P
  '\81'  -> cAny + cIdent + cUpper  -- Q
  '\82'  -> cAny + cIdent + cUpper  -- R
  '\83'  -> cAny + cIdent + cUpper  -- S
  '\84'  -> cAny + cIdent + cUpper  -- T
  '\85'  -> cAny + cIdent + cUpper  -- U
  '\86'  -> cAny + cIdent + cUpper  -- V
  '\87'  -> cAny + cIdent + cUpper  -- W
  '\88'  -> cAny + cIdent + cUpper  -- X
  '\89'  -> cAny + cIdent + cUpper  -- Y
  '\90'  -> cAny + cIdent + cUpper  -- Z
  '\92'  -> cAny + cSymbol  -- backslash
  '\94'  -> cAny + cSymbol  -- ^
  '\95'  -> cAny + cIdent + cLower  -- _
  '\97'  -> cAny + cIdent + cLower  -- a
  '\98'  -> cAny + cIdent + cLower  -- b
  '\99'  -> cAny + cIdent + cLower  -- c
  '\100' -> cAny + cIdent + cLower  -- d
  '\101' -> cAny + cIdent + cLower  -- e
  '\102' -> cAny + cIdent + cLower  -- f
  '\103' -> cAny + cIdent + cLower  -- g
  '\104' -> cAny + cIdent + cLower  -- h
  '\105' -> cAny + cIdent + cLower  -- i
  '\106' -> cAny + cIdent + cLower  -- j
  '\107' -> cAny + cIdent + cLower  -- k
  '\108' -> cAny + cIdent + cLower  -- l
  '\109' -> cAny + cIdent + cLower  -- m
  '\110' -> cAny + cIdent + cLower  -- n
  '\111' -> cAny + cIdent + cLower  -- o
  '\112' -> cAny + cIdent + cLower  -- p
  '\113' -> cAny + cIdent + cLower  -- q
  '\114' -> cAny + cIdent + cLower  -- r
  '\115' -> cAny + cIdent + cLower  -- s
  '\116' -> cAny + cIdent + cLower  -- t
  '\117' -> cAny + cIdent + cLower  -- u
  '\118' -> cAny + cIdent + cLower  -- v
  '\119' -> cAny + cIdent + cLower  -- w
  '\120' -> cAny + cIdent + cLower  -- x
  '\121' -> cAny + cIdent + cLower  -- y
  '\122' -> cAny + cIdent + cLower  -- z
  '\124' -> cAny + cSymbol  -- |
  '\126' -> cAny + cSymbol  -- ~
  '\161' -> cAny + cSymbol  -- ¡
  '\162' -> cAny + cSymbol  -- ¢
  '\163' -> cAny + cSymbol  -- £
  '\164' -> cAny + cSymbol  -- ¤
  '\165' -> cAny + cSymbol  -- ¥
  '\166' -> cAny + cSymbol  -- ¦
  '\167' -> cAny + cSymbol  -- §
  '\168' -> cAny + cSymbol  -- ¨
  '\169' -> cAny + cSymbol  -- ©
  '\170' -> cAny + cSymbol  -- ª
  '\171' -> cAny + cSymbol  -- «
  '\172' -> cAny + cSymbol  -- ¬
  '\173' -> cAny + cSymbol  -- soft hyphen
  '\174' -> cAny + cSymbol  -- ®
  '\175' -> cAny + cSymbol  -- ¯
  '\176' -> cAny + cSymbol  -- °
  '\177' -> cAny + cSymbol  -- ±
  '\178' -> cAny + cSymbol  -- ²
  '\179' -> cAny + cSymbol  -- ³
  '\180' -> cAny + cSymbol  -- ´
  '\181' -> cAny + cSymbol  -- µ
  '\182' -> cAny + cSymbol  -- ¶
  '\183' -> cAny + cSymbol  -- ·
  '\184' -> cAny + cSymbol  -- ¸
  '\185' -> cAny + cSymbol  -- ¹
  '\186' -> cAny + cSymbol  -- º
  '\187' -> cAny + cSymbol  -- »
  '\188' -> cAny + cSymbol  -- ¼
  '\189' -> cAny + cSymbol  -- ½
  '\190' -> cAny + cSymbol  -- ¾
  '\191' -> cAny + cSymbol  -- ¿
  '\192' -> cAny + cIdent + cUpper  -- À
  '\193' -> cAny + cIdent + cUpper  -- Á
  '\194' -> cAny + cIdent + cUpper  -- Â
  '\195' -> cAny + cIdent + cUpper  -- Ã
  '\196' -> cAny + cIdent + cUpper  -- Ä
  '\197' -> cAny + cIdent + cUpper  -- Å
  '\198' -> cAny + cIdent + cUpper  -- Æ
  '\199' -> cAny + cIdent + cUpper  -- Ç
  '\200' -> cAny + cIdent + cUpper  -- È
  '\201' -> cAny + cIdent + cUpper  -- É
  '\202' -> cAny + cIdent + cUpper  -- Ê
  '\203' -> cAny + cIdent + cUpper  -- Ë
  '\204' -> cAny + cIdent + cUpper  -- Ì
  '\205' -> cAny + cIdent + cUpper  -- Í
  '\206' -> cAny + cIdent + cUpper  -- Î
  '\207' -> cAny + cIdent + cUpper  -- Ï
  '\208' -> cAny + cIdent + cUpper  -- Ð
  '\209' -> cAny + cIdent + cUpper  -- Ñ
  '\210' -> cAny + cIdent + cUpper  -- Ò
  '\211' -> cAny + cIdent + cUpper  -- Ó
  '\212' -> cAny + cIdent + cUpper  -- Ô
  '\213' -> cAny + cIdent + cUpper  -- Õ
  '\214' -> cAny + cIdent + cUpper  -- Ö
  '\215' -> cAny + cSymbol  -- ×  (a symbol, not a lower-case letter)
  '\216' -> cAny + cIdent + cUpper  -- Ø
  '\217' -> cAny + cIdent + cUpper  -- Ù
  '\218' -> cAny + cIdent + cUpper  -- Ú
  '\219' -> cAny + cIdent + cUpper  -- Û
  '\220' -> cAny + cIdent + cUpper  -- Ü
  '\221' -> cAny + cIdent + cUpper  -- Ý
  '\222' -> cAny + cIdent + cUpper  -- Þ
  '\223' -> cAny + cIdent + cLower  -- ß  (lower-case letter)
  '\224' -> cAny + cIdent + cLower  -- à
  '\225' -> cAny + cIdent + cLower  -- á
  '\226' -> cAny + cIdent + cLower  -- â
  '\227' -> cAny + cIdent + cLower  -- ã
  '\228' -> cAny + cIdent + cLower  -- ä
  '\229' -> cAny + cIdent + cLower  -- å
  '\230' -> cAny + cIdent + cLower  -- æ
  '\231' -> cAny + cIdent + cLower  -- ç
  '\232' -> cAny + cIdent + cLower  -- è
  '\233' -> cAny + cIdent + cLower  -- é
  '\234' -> cAny + cIdent + cLower  -- ê
  '\235' -> cAny + cIdent + cLower  -- ë
  '\236' -> cAny + cIdent + cLower  -- ì
  '\237' -> cAny + cIdent + cLower  -- í
  '\238' -> cAny + cIdent + cLower  -- î
  '\239' -> cAny + cIdent + cLower  -- ï
  '\240' -> cAny + cIdent + cLower  -- ð
  '\241' -> cAny + cIdent + cLower  -- ñ
  '\242' -> cAny + cIdent + cLower  -- ò
  '\243' -> cAny + cIdent + cLower  -- ó
  '\244' -> cAny + cIdent + cLower  -- ô
  '\245' -> cAny + cIdent + cLower  -- õ
  '\246' -> cAny + cIdent + cLower  -- ö
  '\247' -> cAny + cSymbol  -- ÷
  '\248' -> cAny + cIdent + cLower  -- ø  (lower-case letter)
  '\249' -> cAny + cIdent + cLower  -- ù
  '\250' -> cAny + cIdent + cLower  -- ú
  '\251' -> cAny + cIdent + cLower  -- û
  '\252' -> cAny + cIdent + cLower  -- ü
  '\253' -> cAny + cIdent + cLower  -- ý
  '\254' -> cAny + cIdent + cLower  -- þ
  '\255' -> cAny + cIdent + cLower  -- ÿ
  _      -> panic ("charType: " ++ show c)