1 Character classification
5 ( is_ident -- Char# -> Bool
6 , is_symbol -- Char# -> Bool
7 , is_any -- Char# -> Bool
8 , is_space -- Char# -> Bool
9 , is_lower -- Char# -> Bool
10 , is_upper -- Char# -> Bool
11 , is_digit -- Char# -> Bool
12 , is_alphanum -- Char# -> Bool
14 , is_hexdigit, is_octdigit
15 , hexDigit, octDecDigit
18 #include "HsVersions.h"
20 import DATA_INT ( Int32 )
21 import DATA_BITS ( Bits((.&.)) )
22 import Char ( ord, chr )
28 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
The predicates below look costly, but they aren't: GHC+GCC do a great job
on the big case below.
-- | @is_ctype mask c@ is 'True' when the class bits that 'charType'
-- reports for @c@ share at least one set bit with @mask@.
--
-- Both operands are converted to 'Int32' before the bitwise AND.
{-# INLINE is_ctype #-}
is_ctype :: Int -> Char -> Bool
is_ctype mask c = overlap /= 0
  where
    overlap :: Int32
    overlap = fromIntegral (charType c) .&. fromIntegral mask
-- | Character-class predicates.  Each one asks 'is_ctype' whether the
-- character's 'charType' value has the corresponding class bit set.
is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit, is_alphanum :: Char -> Bool
is_ident    = is_ctype cIdent
is_symbol   = is_ctype cSymbol
is_any      = is_ctype cAny
is_space    = is_ctype cSpace
is_lower    = is_ctype cLower
is_upper    = is_ctype cUpper
is_digit    = is_ctype cDigit
-- Matches a character that has any of the lower, upper or digit bits.
-- NOTE(review): using + to combine the masks assumes the three class
-- constants have disjoint bits -- confirm against their definitions.
is_alphanum = is_ctype (cLower+cUpper+cDigit)
-- | Numeric value of a hexadecimal digit character.
--
-- Characters accepted by 'is_digit' map to their offset from @\'0\'@;
-- anything else is passed through 'to_lower' and mapped to its offset
-- from @\'a\'@ plus ten.  No check is made that the argument really is
-- a hex digit, so other characters yield a meaningless number.
hexDigit :: Char -> Int
hexDigit c
  | is_digit c = ord c - ord '0'
  | otherwise  = letterOffset + 10
  where
    letterOffset = ord (to_lower c) - ord 'a'
-- | Numeric value of an octal or decimal digit character, computed as
-- its code-point offset from @\'0\'@.  The argument is not range-checked.
octDecDigit :: Char -> Int
octDecDigit c = ord c - zero
  where
    zero = ord '0'
69 || (c >= 'a' && c <= 'f')
70 || (c >= 'A' && c <= 'F')
-- True exactly for the characters '0' through '7'.
is_octdigit c = '0' <= c && c <= '7'
75 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
79 We really mean .|. instead of + below, but GHC currently doesn't do
80 any constant folding with bitops. *sigh*
83 charType :: Char -> Int
84 charType c = case c of
94 '\9' -> cAny + cSpace -- \t
95 '\10' -> cSpace -- \n (not allowed in strings, so !cAny)
96 '\11' -> cAny + cSpace -- \v
97 '\12' -> cAny + cSpace -- \f
98 '\13' -> cAny + cSpace -- ^M
117 '\32' -> cAny + cSpace --
118 '\33' -> cAny + cSymbol -- !
120 '\35' -> cAny + cSymbol -- #
121 '\36' -> cAny + cSymbol -- $
122 '\37' -> cAny + cSymbol -- %
123 '\38' -> cAny + cSymbol -- &
124 '\39' -> cAny + cIdent -- '
127 '\42' -> cAny + cSymbol -- *
128 '\43' -> cAny + cSymbol -- +
130 '\45' -> cAny + cSymbol -- -
131 '\46' -> cAny + cSymbol -- .
132 '\47' -> cAny + cSymbol -- /
133 '\48' -> cAny + cIdent + cDigit -- 0
134 '\49' -> cAny + cIdent + cDigit -- 1
135 '\50' -> cAny + cIdent + cDigit -- 2
136 '\51' -> cAny + cIdent + cDigit -- 3
137 '\52' -> cAny + cIdent + cDigit -- 4
138 '\53' -> cAny + cIdent + cDigit -- 5
139 '\54' -> cAny + cIdent + cDigit -- 6
140 '\55' -> cAny + cIdent + cDigit -- 7
141 '\56' -> cAny + cIdent + cDigit -- 8
142 '\57' -> cAny + cIdent + cDigit -- 9
143 '\58' -> cAny + cSymbol -- :
145 '\60' -> cAny + cSymbol -- <
146 '\61' -> cAny + cSymbol -- =
147 '\62' -> cAny + cSymbol -- >
148 '\63' -> cAny + cSymbol -- ?
149 '\64' -> cAny + cSymbol -- @
150 '\65' -> cAny + cIdent + cUpper -- A
151 '\66' -> cAny + cIdent + cUpper -- B
152 '\67' -> cAny + cIdent + cUpper -- C
153 '\68' -> cAny + cIdent + cUpper -- D
154 '\69' -> cAny + cIdent + cUpper -- E
155 '\70' -> cAny + cIdent + cUpper -- F
156 '\71' -> cAny + cIdent + cUpper -- G
157 '\72' -> cAny + cIdent + cUpper -- H
158 '\73' -> cAny + cIdent + cUpper -- I
159 '\74' -> cAny + cIdent + cUpper -- J
160 '\75' -> cAny + cIdent + cUpper -- K
161 '\76' -> cAny + cIdent + cUpper -- L
162 '\77' -> cAny + cIdent + cUpper -- M
163 '\78' -> cAny + cIdent + cUpper -- N
164 '\79' -> cAny + cIdent + cUpper -- O
165 '\80' -> cAny + cIdent + cUpper -- P
166 '\81' -> cAny + cIdent + cUpper -- Q
167 '\82' -> cAny + cIdent + cUpper -- R
168 '\83' -> cAny + cIdent + cUpper -- S
169 '\84' -> cAny + cIdent + cUpper -- T
170 '\85' -> cAny + cIdent + cUpper -- U
171 '\86' -> cAny + cIdent + cUpper -- V
172 '\87' -> cAny + cIdent + cUpper -- W
173 '\88' -> cAny + cIdent + cUpper -- X
174 '\89' -> cAny + cIdent + cUpper -- Y
175 '\90' -> cAny + cIdent + cUpper -- Z
177 '\92' -> cAny + cSymbol -- backslash
179 '\94' -> cAny + cSymbol -- ^
180 '\95' -> cAny + cIdent + cLower -- _
182 '\97' -> cAny + cIdent + cLower -- a
183 '\98' -> cAny + cIdent + cLower -- b
184 '\99' -> cAny + cIdent + cLower -- c
185 '\100' -> cAny + cIdent + cLower -- d
186 '\101' -> cAny + cIdent + cLower -- e
187 '\102' -> cAny + cIdent + cLower -- f
188 '\103' -> cAny + cIdent + cLower -- g
189 '\104' -> cAny + cIdent + cLower -- h
190 '\105' -> cAny + cIdent + cLower -- i
191 '\106' -> cAny + cIdent + cLower -- j
192 '\107' -> cAny + cIdent + cLower -- k
193 '\108' -> cAny + cIdent + cLower -- l
194 '\109' -> cAny + cIdent + cLower -- m
195 '\110' -> cAny + cIdent + cLower -- n
196 '\111' -> cAny + cIdent + cLower -- o
197 '\112' -> cAny + cIdent + cLower -- p
198 '\113' -> cAny + cIdent + cLower -- q
199 '\114' -> cAny + cIdent + cLower -- r
200 '\115' -> cAny + cIdent + cLower -- s
201 '\116' -> cAny + cIdent + cLower -- t
202 '\117' -> cAny + cIdent + cLower -- u
203 '\118' -> cAny + cIdent + cLower -- v
204 '\119' -> cAny + cIdent + cLower -- w
205 '\120' -> cAny + cIdent + cLower -- x
206 '\121' -> cAny + cIdent + cLower -- y
207 '\122' -> cAny + cIdent + cLower -- z
209 '\124' -> cAny + cSymbol -- |
211 '\126' -> cAny + cSymbol -- ~
246 '\161' -> cAny + cSymbol -- ¡
247 '\162' -> cAny + cSymbol -- ¢
248 '\163' -> cAny + cSymbol -- £
249 '\164' -> cAny + cSymbol -- ¤
250 '\165' -> cAny + cSymbol -- ¥
251 '\166' -> cAny + cSymbol -- ¦
252 '\167' -> cAny + cSymbol -- §
253 '\168' -> cAny + cSymbol -- ¨
254 '\169' -> cAny + cSymbol -- ©
255 '\170' -> cAny + cSymbol -- ª
256 '\171' -> cAny + cSymbol -- «
257 '\172' -> cAny + cSymbol -- ¬
258 '\173' -> cAny + cSymbol --
259 '\174' -> cAny + cSymbol -- ®
260 '\175' -> cAny + cSymbol -- ¯
261 '\176' -> cAny + cSymbol -- °
262 '\177' -> cAny + cSymbol -- ±
263 '\178' -> cAny + cSymbol -- ²
264 '\179' -> cAny + cSymbol -- ³
265 '\180' -> cAny + cSymbol -- ´
266 '\181' -> cAny + cSymbol -- µ
267 '\182' -> cAny + cSymbol -- ¶
268 '\183' -> cAny + cSymbol -- ·
269 '\184' -> cAny + cSymbol -- ¸
270 '\185' -> cAny + cSymbol -- ¹
271 '\186' -> cAny + cSymbol -- º
272 '\187' -> cAny + cSymbol -- »
273 '\188' -> cAny + cSymbol -- ¼
274 '\189' -> cAny + cSymbol -- ½
275 '\190' -> cAny + cSymbol -- ¾
276 '\191' -> cAny + cSymbol -- ¿
277 '\192' -> cAny + cIdent + cUpper -- À
278 '\193' -> cAny + cIdent + cUpper -- Á
279 '\194' -> cAny + cIdent + cUpper -- Â
280 '\195' -> cAny + cIdent + cUpper -- Ã
281 '\196' -> cAny + cIdent + cUpper -- Ä
282 '\197' -> cAny + cIdent + cUpper -- Å
283 '\198' -> cAny + cIdent + cUpper -- Æ
284 '\199' -> cAny + cIdent + cUpper -- Ç
285 '\200' -> cAny + cIdent + cUpper -- È
286 '\201' -> cAny + cIdent + cUpper -- É
287 '\202' -> cAny + cIdent + cUpper -- Ê
288 '\203' -> cAny + cIdent + cUpper -- Ë
289 '\204' -> cAny + cIdent + cUpper -- Ì
290 '\205' -> cAny + cIdent + cUpper -- Í
291 '\206' -> cAny + cIdent + cUpper -- Î
292 '\207' -> cAny + cIdent + cUpper -- Ï
293 '\208' -> cAny + cIdent + cUpper -- Ð
294 '\209' -> cAny + cIdent + cUpper -- Ñ
295 '\210' -> cAny + cIdent + cUpper -- Ò
296 '\211' -> cAny + cIdent + cUpper -- Ó
297 '\212' -> cAny + cIdent + cUpper -- Ô
298 '\213' -> cAny + cIdent + cUpper -- Õ
299 '\214' -> cAny + cIdent + cUpper -- Ö
300 '\215' -> cAny + cSymbol + cLower -- ×
301 '\216' -> cAny + cIdent + cUpper -- Ø
302 '\217' -> cAny + cIdent + cUpper -- Ù
303 '\218' -> cAny + cIdent + cUpper -- Ú
304 '\219' -> cAny + cIdent + cUpper -- Û
305 '\220' -> cAny + cIdent + cUpper -- Ü
306 '\221' -> cAny + cIdent + cUpper -- Ý
307 '\222' -> cAny + cIdent + cUpper -- Þ
308 '\223' -> cAny + cIdent -- ß
309 '\224' -> cAny + cIdent + cLower -- à
310 '\225' -> cAny + cIdent + cLower -- á
311 '\226' -> cAny + cIdent + cLower -- â
312 '\227' -> cAny + cIdent + cLower -- ã
313 '\228' -> cAny + cIdent + cLower -- ä
314 '\229' -> cAny + cIdent + cLower -- å
315 '\230' -> cAny + cIdent + cLower -- æ
316 '\231' -> cAny + cIdent + cLower -- ç
317 '\232' -> cAny + cIdent + cLower -- è
318 '\233' -> cAny + cIdent + cLower -- é
319 '\234' -> cAny + cIdent + cLower -- ê
320 '\235' -> cAny + cIdent + cLower -- ë
321 '\236' -> cAny + cIdent + cLower -- ì
322 '\237' -> cAny + cIdent + cLower -- í
323 '\238' -> cAny + cIdent + cLower -- î
324 '\239' -> cAny + cIdent + cLower -- ï
325 '\240' -> cAny + cIdent + cLower -- ð
326 '\241' -> cAny + cIdent + cLower -- ñ
327 '\242' -> cAny + cIdent + cLower -- ò
328 '\243' -> cAny + cIdent + cLower -- ó
329 '\244' -> cAny + cIdent + cLower -- ô
330 '\245' -> cAny + cIdent + cLower -- õ
331 '\246' -> cAny + cIdent + cLower -- ö
332 '\247' -> cAny + cSymbol -- ÷
333 '\248' -> cAny + cIdent -- ø
334 '\249' -> cAny + cIdent + cLower -- ù
335 '\250' -> cAny + cIdent + cLower -- ú
336 '\251' -> cAny + cIdent + cLower -- û
337 '\252' -> cAny + cIdent + cLower -- ü
338 '\253' -> cAny + cIdent + cLower -- ý
339 '\254' -> cAny + cIdent + cLower -- þ
340 '\255' -> cAny + cIdent + cLower -- ÿ