1 Character classification
5 ( is_ident -- Char# -> Bool
6 , is_symbol -- Char# -> Bool
7 , is_any -- Char# -> Bool
8 , is_space -- Char# -> Bool
9 , is_lower -- Char# -> Bool
10 , is_upper -- Char# -> Bool
11 , is_digit -- Char# -> Bool
13 , is_hexdigit, is_octdigit
14 , hexDigit, octDecDigit
17 #include "HsVersions.h"
19 import DATA_INT ( Int32 )
20 import DATA_BITS ( Bits((.&.)) )
21 import Char ( ord, chr )
27 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
The predicates below look costly, but they aren't: GHC+GCC do a great job
on the big case expression below.
-- Test whether character c belongs to at least one of the classes whose
-- bits are set in mask.  The class word of c (from charType) and the mask
-- are both converted to Int32 before being and-ed together; the result is
-- True exactly when the two share a bit.
{-# INLINE is_ctype #-}
is_ctype :: Int -> Char -> Bool
is_ctype mask c = (classBits .&. wanted) /= 0
  where
    classBits, wanted :: Int32
    classBits = fromIntegral (charType c)
    wanted    = fromIntegral mask
-- Single-class membership tests.  Each predicate asks is_ctype whether the
-- character's class word contains the corresponding mask bit.

is_ident :: Char -> Bool
is_ident c = is_ctype cIdent c

is_symbol :: Char -> Bool
is_symbol c = is_ctype cSymbol c

is_any :: Char -> Bool
is_any c = is_ctype cAny c

is_space :: Char -> Bool
is_space c = is_ctype cSpace c

is_lower :: Char -> Bool
is_lower c = is_ctype cLower c

is_upper :: Char -> Bool
is_upper c = is_ctype cUpper c

is_digit :: Char -> Bool
is_digit c = is_ctype cDigit c
-- Numeric value of a hexadecimal digit character.  Characters accepted by
-- is_digit are measured from '0'; every other character is passed through
-- to_lower and measured from 'a', offset by 10.  No check is made that the
-- argument really is a hexadecimal digit.
hexDigit :: Char -> Int
hexDigit c =
  if is_digit c
    then ord c - ord '0'
    else ord (to_lower c) - ord 'a' + 10
-- Numeric value of an octal or decimal digit character: the distance of its
-- code point from that of '0'.  The argument is not checked to be a digit.
octDecDigit :: Char -> Int
octDecDigit = subtract (ord '0') . ord
67 || (c >= 'a' && c <= 'f')
68 || (c >= 'A' && c <= 'F')
-- Octal digits are exactly the characters '0' through '7'.
is_octdigit c
  | c < '0'   = False
  | otherwise = c <= '7'
73 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
77 We really mean .|. instead of + below, but GHC currently doesn't do
78 any constant folding with bitops. *sigh*
-- Map a character to its class word: the sum of the c* mask constants
-- (cAny, cSpace, cSymbol, cIdent, cDigit, cUpper, cLower) that apply to it.
-- The is_* predicates test individual bits of this word via is_ctype.
--
-- The table is deliberately kept as one flat case expression on character
-- literals.  The masks are combined with + rather than .|. .
--
-- Corrections relative to the previous table (Latin-1 range):
--   * '\215' (multiplication sign) is a symbol and no longer carries cLower;
--   * '\223' (sharp s) and '\248' (o with stroke) are lowercase letters and
--     now carry cLower, like the other lowercase letters around them.
--
-- NOTE(review): a code point that has no alternative in this case
-- expression results in a pattern-match failure; confirm that callers only
-- pass characters that are listed.
charType :: Char -> Int
charType c = case c of
  '\9'   -> cAny + cSpace                         -- \t
  '\10'  -> cSpace                                -- \n (not allowed in strings, so !cAny)
  '\11'  -> cAny + cSpace                         -- \v
  '\12'  -> cAny + cSpace                         -- \f
  '\13'  -> cAny + cSpace                         -- ^M
  '\32'  -> cAny + cSpace                         -- (space)
  '\33'  -> cAny + cSymbol                        -- !
  '\35'  -> cAny + cSymbol                        -- #
  '\36'  -> cAny + cSymbol                        -- $
  '\37'  -> cAny + cSymbol                        -- %
  '\38'  -> cAny + cSymbol                        -- &
  '\39'  -> cAny + cIdent                         -- '
  '\42'  -> cAny + cSymbol                        -- *
  '\43'  -> cAny + cSymbol                        -- +
  '\45'  -> cAny + cSymbol                        -- -
  '\46'  -> cAny + cSymbol                        -- .
  '\47'  -> cAny + cSymbol                        -- /
  '\48'  -> cAny + cIdent + cDigit                -- 0
  '\49'  -> cAny + cIdent + cDigit                -- 1
  '\50'  -> cAny + cIdent + cDigit                -- 2
  '\51'  -> cAny + cIdent + cDigit                -- 3
  '\52'  -> cAny + cIdent + cDigit                -- 4
  '\53'  -> cAny + cIdent + cDigit                -- 5
  '\54'  -> cAny + cIdent + cDigit                -- 6
  '\55'  -> cAny + cIdent + cDigit                -- 7
  '\56'  -> cAny + cIdent + cDigit                -- 8
  '\57'  -> cAny + cIdent + cDigit                -- 9
  '\58'  -> cAny + cSymbol                        -- :
  '\60'  -> cAny + cSymbol                        -- <
  '\61'  -> cAny + cSymbol                        -- =
  '\62'  -> cAny + cSymbol                        -- >
  '\63'  -> cAny + cSymbol                        -- ?
  '\64'  -> cAny + cSymbol                        -- @
  '\65'  -> cAny + cIdent + cUpper                -- A
  '\66'  -> cAny + cIdent + cUpper                -- B
  '\67'  -> cAny + cIdent + cUpper                -- C
  '\68'  -> cAny + cIdent + cUpper                -- D
  '\69'  -> cAny + cIdent + cUpper                -- E
  '\70'  -> cAny + cIdent + cUpper                -- F
  '\71'  -> cAny + cIdent + cUpper                -- G
  '\72'  -> cAny + cIdent + cUpper                -- H
  '\73'  -> cAny + cIdent + cUpper                -- I
  '\74'  -> cAny + cIdent + cUpper                -- J
  '\75'  -> cAny + cIdent + cUpper                -- K
  '\76'  -> cAny + cIdent + cUpper                -- L
  '\77'  -> cAny + cIdent + cUpper                -- M
  '\78'  -> cAny + cIdent + cUpper                -- N
  '\79'  -> cAny + cIdent + cUpper                -- O
  '\80'  -> cAny + cIdent + cUpper                -- P
  '\81'  -> cAny + cIdent + cUpper                -- Q
  '\82'  -> cAny + cIdent + cUpper                -- R
  '\83'  -> cAny + cIdent + cUpper                -- S
  '\84'  -> cAny + cIdent + cUpper                -- T
  '\85'  -> cAny + cIdent + cUpper                -- U
  '\86'  -> cAny + cIdent + cUpper                -- V
  '\87'  -> cAny + cIdent + cUpper                -- W
  '\88'  -> cAny + cIdent + cUpper                -- X
  '\89'  -> cAny + cIdent + cUpper                -- Y
  '\90'  -> cAny + cIdent + cUpper                -- Z
  '\92'  -> cAny + cSymbol                        -- backslash
  '\94'  -> cAny + cSymbol                        -- ^
  '\95'  -> cAny + cIdent + cLower                -- _
  '\97'  -> cAny + cIdent + cLower                -- a
  '\98'  -> cAny + cIdent + cLower                -- b
  '\99'  -> cAny + cIdent + cLower                -- c
  '\100' -> cAny + cIdent + cLower                -- d
  '\101' -> cAny + cIdent + cLower                -- e
  '\102' -> cAny + cIdent + cLower                -- f
  '\103' -> cAny + cIdent + cLower                -- g
  '\104' -> cAny + cIdent + cLower                -- h
  '\105' -> cAny + cIdent + cLower                -- i
  '\106' -> cAny + cIdent + cLower                -- j
  '\107' -> cAny + cIdent + cLower                -- k
  '\108' -> cAny + cIdent + cLower                -- l
  '\109' -> cAny + cIdent + cLower                -- m
  '\110' -> cAny + cIdent + cLower                -- n
  '\111' -> cAny + cIdent + cLower                -- o
  '\112' -> cAny + cIdent + cLower                -- p
  '\113' -> cAny + cIdent + cLower                -- q
  '\114' -> cAny + cIdent + cLower                -- r
  '\115' -> cAny + cIdent + cLower                -- s
  '\116' -> cAny + cIdent + cLower                -- t
  '\117' -> cAny + cIdent + cLower                -- u
  '\118' -> cAny + cIdent + cLower                -- v
  '\119' -> cAny + cIdent + cLower                -- w
  '\120' -> cAny + cIdent + cLower                -- x
  '\121' -> cAny + cIdent + cLower                -- y
  '\122' -> cAny + cIdent + cLower                -- z
  '\124' -> cAny + cSymbol                        -- |
  '\126' -> cAny + cSymbol                        -- ~
  '\161' -> cAny + cSymbol                        -- ¡
  '\162' -> cAny + cSymbol                        -- ¢
  '\163' -> cAny + cSymbol                        -- £
  '\164' -> cAny + cSymbol                        -- ¤
  '\165' -> cAny + cSymbol                        -- ¥
  '\166' -> cAny + cSymbol                        -- ¦
  '\167' -> cAny + cSymbol                        -- §
  '\168' -> cAny + cSymbol                        -- ¨
  '\169' -> cAny + cSymbol                        -- ©
  '\170' -> cAny + cSymbol                        -- ª
  '\171' -> cAny + cSymbol                        -- «
  '\172' -> cAny + cSymbol                        -- ¬
  '\173' -> cAny + cSymbol                        -- (soft hyphen)
  '\174' -> cAny + cSymbol                        -- ®
  '\175' -> cAny + cSymbol                        -- ¯
  '\176' -> cAny + cSymbol                        -- °
  '\177' -> cAny + cSymbol                        -- ±
  '\178' -> cAny + cSymbol                        -- ²
  '\179' -> cAny + cSymbol                        -- ³
  '\180' -> cAny + cSymbol                        -- ´
  '\181' -> cAny + cSymbol                        -- µ
  '\182' -> cAny + cSymbol                        -- ¶
  '\183' -> cAny + cSymbol                        -- ·
  '\184' -> cAny + cSymbol                        -- ¸
  '\185' -> cAny + cSymbol                        -- ¹
  '\186' -> cAny + cSymbol                        -- º
  '\187' -> cAny + cSymbol                        -- »
  '\188' -> cAny + cSymbol                        -- ¼
  '\189' -> cAny + cSymbol                        -- ½
  '\190' -> cAny + cSymbol                        -- ¾
  '\191' -> cAny + cSymbol                        -- ¿
  '\192' -> cAny + cIdent + cUpper                -- À
  '\193' -> cAny + cIdent + cUpper                -- Á
  '\194' -> cAny + cIdent + cUpper                -- Â
  '\195' -> cAny + cIdent + cUpper                -- Ã
  '\196' -> cAny + cIdent + cUpper                -- Ä
  '\197' -> cAny + cIdent + cUpper                -- Å
  '\198' -> cAny + cIdent + cUpper                -- Æ
  '\199' -> cAny + cIdent + cUpper                -- Ç
  '\200' -> cAny + cIdent + cUpper                -- È
  '\201' -> cAny + cIdent + cUpper                -- É
  '\202' -> cAny + cIdent + cUpper                -- Ê
  '\203' -> cAny + cIdent + cUpper                -- Ë
  '\204' -> cAny + cIdent + cUpper                -- Ì
  '\205' -> cAny + cIdent + cUpper                -- Í
  '\206' -> cAny + cIdent + cUpper                -- Î
  '\207' -> cAny + cIdent + cUpper                -- Ï
  '\208' -> cAny + cIdent + cUpper                -- Ð
  '\209' -> cAny + cIdent + cUpper                -- Ñ
  '\210' -> cAny + cIdent + cUpper                -- Ò
  '\211' -> cAny + cIdent + cUpper                -- Ó
  '\212' -> cAny + cIdent + cUpper                -- Ô
  '\213' -> cAny + cIdent + cUpper                -- Õ
  '\214' -> cAny + cIdent + cUpper                -- Ö
  '\215' -> cAny + cSymbol                        -- × (a symbol, so no cLower)
  '\216' -> cAny + cIdent + cUpper                -- Ø
  '\217' -> cAny + cIdent + cUpper                -- Ù
  '\218' -> cAny + cIdent + cUpper                -- Ú
  '\219' -> cAny + cIdent + cUpper                -- Û
  '\220' -> cAny + cIdent + cUpper                -- Ü
  '\221' -> cAny + cIdent + cUpper                -- Ý
  '\222' -> cAny + cIdent + cUpper                -- Þ
  '\223' -> cAny + cIdent + cLower                -- ß (lowercase letter)
  '\224' -> cAny + cIdent + cLower                -- à
  '\225' -> cAny + cIdent + cLower                -- á
  '\226' -> cAny + cIdent + cLower                -- â
  '\227' -> cAny + cIdent + cLower                -- ã
  '\228' -> cAny + cIdent + cLower                -- ä
  '\229' -> cAny + cIdent + cLower                -- å
  '\230' -> cAny + cIdent + cLower                -- æ
  '\231' -> cAny + cIdent + cLower                -- ç
  '\232' -> cAny + cIdent + cLower                -- è
  '\233' -> cAny + cIdent + cLower                -- é
  '\234' -> cAny + cIdent + cLower                -- ê
  '\235' -> cAny + cIdent + cLower                -- ë
  '\236' -> cAny + cIdent + cLower                -- ì
  '\237' -> cAny + cIdent + cLower                -- í
  '\238' -> cAny + cIdent + cLower                -- î
  '\239' -> cAny + cIdent + cLower                -- ï
  '\240' -> cAny + cIdent + cLower                -- ð
  '\241' -> cAny + cIdent + cLower                -- ñ
  '\242' -> cAny + cIdent + cLower                -- ò
  '\243' -> cAny + cIdent + cLower                -- ó
  '\244' -> cAny + cIdent + cLower                -- ô
  '\245' -> cAny + cIdent + cLower                -- õ
  '\246' -> cAny + cIdent + cLower                -- ö
  '\247' -> cAny + cSymbol                        -- ÷
  '\248' -> cAny + cIdent + cLower                -- ø (lowercase letter)
  '\249' -> cAny + cIdent + cLower                -- ù
  '\250' -> cAny + cIdent + cLower                -- ú
  '\251' -> cAny + cIdent + cLower                -- û
  '\252' -> cAny + cIdent + cLower                -- ü
  '\253' -> cAny + cIdent + cLower                -- ý
  '\254' -> cAny + cIdent + cLower                -- þ
  '\255' -> cAny + cIdent + cLower                -- ÿ