1 Character classification
5 -- The above warning supression flag is a temporary kludge.
6 -- While working on this module you are encouraged to remove it and fix
7 -- any warnings in the module. See
8 -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
12 ( is_ident -- Char# -> Bool
13 , is_symbol -- Char# -> Bool
14 , is_any -- Char# -> Bool
15 , is_space -- Char# -> Bool
16 , is_lower -- Char# -> Bool
17 , is_upper -- Char# -> Bool
18 , is_digit -- Char# -> Bool
19 , is_alphanum -- Char# -> Bool
21 , is_hexdigit, is_octdigit
22 , hexDigit, octDecDigit
25 #include "HsVersions.h"
27 import Data.Int ( Int32 )
28 import Data.Bits ( Bits((.&.)) )
29 import Data.Char ( ord, chr )
35 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
45 The predicates below look costly, but aren't, GHC+GCC do a great job
46 at the big case below.
49 {-# INLINE is_ctype #-}
50 is_ctype :: Int -> Char -> Bool
51 is_ctype mask c = (fromIntegral (charType c) .&. fromIntegral mask) /= (0::Int32)
53 is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit :: Char -> Bool
54 is_ident = is_ctype cIdent
55 is_symbol = is_ctype cSymbol
56 is_any = is_ctype cAny
57 is_space = is_ctype cSpace
58 is_lower = is_ctype cLower
59 is_upper = is_ctype cUpper
60 is_digit = is_ctype cDigit
61 is_alphanum = is_ctype (cLower+cUpper+cDigit)
67 hexDigit :: Char -> Int
68 hexDigit c | is_digit c = ord c - ord '0'
69 | otherwise = ord (to_lower c) - ord 'a' + 10
71 octDecDigit :: Char -> Int
72 octDecDigit c = ord c - ord '0'
76 || (c >= 'a' && c <= 'f')
77 || (c >= 'A' && c <= 'F')
79 is_octdigit c = c >= '0' && c <= '7'
82 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
86 We really mean .|. instead of + below, but GHC currently doesn't do
87 any constant folding with bitops. *sigh*
90 charType :: Char -> Int
91 charType c = case c of
101 '\9' -> cSpace -- \t (not allowed in strings, so !cAny)
102 '\10' -> cSpace -- \n (ditto)
103 '\11' -> cSpace -- \v (ditto)
104 '\12' -> cSpace -- \f (ditto)
105 '\13' -> cSpace -- ^M (ditto)
124 '\32' -> cAny + cSpace --
125 '\33' -> cAny + cSymbol -- !
127 '\35' -> cAny + cSymbol -- #
128 '\36' -> cAny + cSymbol -- $
129 '\37' -> cAny + cSymbol -- %
130 '\38' -> cAny + cSymbol -- &
131 '\39' -> cAny + cIdent -- '
134 '\42' -> cAny + cSymbol -- *
135 '\43' -> cAny + cSymbol -- +
137 '\45' -> cAny + cSymbol -- -
138 '\46' -> cAny + cSymbol -- .
139 '\47' -> cAny + cSymbol -- /
140 '\48' -> cAny + cIdent + cDigit -- 0
141 '\49' -> cAny + cIdent + cDigit -- 1
142 '\50' -> cAny + cIdent + cDigit -- 2
143 '\51' -> cAny + cIdent + cDigit -- 3
144 '\52' -> cAny + cIdent + cDigit -- 4
145 '\53' -> cAny + cIdent + cDigit -- 5
146 '\54' -> cAny + cIdent + cDigit -- 6
147 '\55' -> cAny + cIdent + cDigit -- 7
148 '\56' -> cAny + cIdent + cDigit -- 8
149 '\57' -> cAny + cIdent + cDigit -- 9
150 '\58' -> cAny + cSymbol -- :
152 '\60' -> cAny + cSymbol -- <
153 '\61' -> cAny + cSymbol -- =
154 '\62' -> cAny + cSymbol -- >
155 '\63' -> cAny + cSymbol -- ?
156 '\64' -> cAny + cSymbol -- @
157 '\65' -> cAny + cIdent + cUpper -- A
158 '\66' -> cAny + cIdent + cUpper -- B
159 '\67' -> cAny + cIdent + cUpper -- C
160 '\68' -> cAny + cIdent + cUpper -- D
161 '\69' -> cAny + cIdent + cUpper -- E
162 '\70' -> cAny + cIdent + cUpper -- F
163 '\71' -> cAny + cIdent + cUpper -- G
164 '\72' -> cAny + cIdent + cUpper -- H
165 '\73' -> cAny + cIdent + cUpper -- I
166 '\74' -> cAny + cIdent + cUpper -- J
167 '\75' -> cAny + cIdent + cUpper -- K
168 '\76' -> cAny + cIdent + cUpper -- L
169 '\77' -> cAny + cIdent + cUpper -- M
170 '\78' -> cAny + cIdent + cUpper -- N
171 '\79' -> cAny + cIdent + cUpper -- O
172 '\80' -> cAny + cIdent + cUpper -- P
173 '\81' -> cAny + cIdent + cUpper -- Q
174 '\82' -> cAny + cIdent + cUpper -- R
175 '\83' -> cAny + cIdent + cUpper -- S
176 '\84' -> cAny + cIdent + cUpper -- T
177 '\85' -> cAny + cIdent + cUpper -- U
178 '\86' -> cAny + cIdent + cUpper -- V
179 '\87' -> cAny + cIdent + cUpper -- W
180 '\88' -> cAny + cIdent + cUpper -- X
181 '\89' -> cAny + cIdent + cUpper -- Y
182 '\90' -> cAny + cIdent + cUpper -- Z
184 '\92' -> cAny + cSymbol -- backslash
186 '\94' -> cAny + cSymbol -- ^
187 '\95' -> cAny + cIdent + cLower -- _
189 '\97' -> cAny + cIdent + cLower -- a
190 '\98' -> cAny + cIdent + cLower -- b
191 '\99' -> cAny + cIdent + cLower -- c
192 '\100' -> cAny + cIdent + cLower -- d
193 '\101' -> cAny + cIdent + cLower -- e
194 '\102' -> cAny + cIdent + cLower -- f
195 '\103' -> cAny + cIdent + cLower -- g
196 '\104' -> cAny + cIdent + cLower -- h
197 '\105' -> cAny + cIdent + cLower -- i
198 '\106' -> cAny + cIdent + cLower -- j
199 '\107' -> cAny + cIdent + cLower -- k
200 '\108' -> cAny + cIdent + cLower -- l
201 '\109' -> cAny + cIdent + cLower -- m
202 '\110' -> cAny + cIdent + cLower -- n
203 '\111' -> cAny + cIdent + cLower -- o
204 '\112' -> cAny + cIdent + cLower -- p
205 '\113' -> cAny + cIdent + cLower -- q
206 '\114' -> cAny + cIdent + cLower -- r
207 '\115' -> cAny + cIdent + cLower -- s
208 '\116' -> cAny + cIdent + cLower -- t
209 '\117' -> cAny + cIdent + cLower -- u
210 '\118' -> cAny + cIdent + cLower -- v
211 '\119' -> cAny + cIdent + cLower -- w
212 '\120' -> cAny + cIdent + cLower -- x
213 '\121' -> cAny + cIdent + cLower -- y
214 '\122' -> cAny + cIdent + cLower -- z
216 '\124' -> cAny + cSymbol -- |
218 '\126' -> cAny + cSymbol -- ~
253 '\161' -> cAny + cSymbol -- ¡
254 '\162' -> cAny + cSymbol -- ¢
255 '\163' -> cAny + cSymbol -- £
256 '\164' -> cAny + cSymbol -- ¤
257 '\165' -> cAny + cSymbol -- ¥
258 '\166' -> cAny + cSymbol -- ¦
259 '\167' -> cAny + cSymbol -- §
260 '\168' -> cAny + cSymbol -- ¨
261 '\169' -> cAny + cSymbol -- ©
262 '\170' -> cAny + cSymbol -- ª
263 '\171' -> cAny + cSymbol -- «
264 '\172' -> cAny + cSymbol -- ¬
265 '\173' -> cAny + cSymbol --
266 '\174' -> cAny + cSymbol -- ®
267 '\175' -> cAny + cSymbol -- ¯
268 '\176' -> cAny + cSymbol -- °
269 '\177' -> cAny + cSymbol -- ±
270 '\178' -> cAny + cSymbol -- ²
271 '\179' -> cAny + cSymbol -- ³
272 '\180' -> cAny + cSymbol -- ´
273 '\181' -> cAny + cSymbol -- µ
274 '\182' -> cAny + cSymbol -- ¶
275 '\183' -> cAny + cSymbol -- ·
276 '\184' -> cAny + cSymbol -- ¸
277 '\185' -> cAny + cSymbol -- ¹
278 '\186' -> cAny + cSymbol -- º
279 '\187' -> cAny + cSymbol -- »
280 '\188' -> cAny + cSymbol -- ¼
281 '\189' -> cAny + cSymbol -- ½
282 '\190' -> cAny + cSymbol -- ¾
283 '\191' -> cAny + cSymbol -- ¿
284 '\192' -> cAny + cIdent + cUpper -- À
285 '\193' -> cAny + cIdent + cUpper -- Á
286 '\194' -> cAny + cIdent + cUpper -- Â
287 '\195' -> cAny + cIdent + cUpper -- Ã
288 '\196' -> cAny + cIdent + cUpper -- Ä
289 '\197' -> cAny + cIdent + cUpper -- Å
290 '\198' -> cAny + cIdent + cUpper -- Æ
291 '\199' -> cAny + cIdent + cUpper -- Ç
292 '\200' -> cAny + cIdent + cUpper -- È
293 '\201' -> cAny + cIdent + cUpper -- É
294 '\202' -> cAny + cIdent + cUpper -- Ê
295 '\203' -> cAny + cIdent + cUpper -- Ë
296 '\204' -> cAny + cIdent + cUpper -- Ì
297 '\205' -> cAny + cIdent + cUpper -- Í
298 '\206' -> cAny + cIdent + cUpper -- Î
299 '\207' -> cAny + cIdent + cUpper -- Ï
300 '\208' -> cAny + cIdent + cUpper -- Ð
301 '\209' -> cAny + cIdent + cUpper -- Ñ
302 '\210' -> cAny + cIdent + cUpper -- Ò
303 '\211' -> cAny + cIdent + cUpper -- Ó
304 '\212' -> cAny + cIdent + cUpper -- Ô
305 '\213' -> cAny + cIdent + cUpper -- Õ
306 '\214' -> cAny + cIdent + cUpper -- Ö
307 '\215' -> cAny + cSymbol + cLower -- ×
308 '\216' -> cAny + cIdent + cUpper -- Ø
309 '\217' -> cAny + cIdent + cUpper -- Ù
310 '\218' -> cAny + cIdent + cUpper -- Ú
311 '\219' -> cAny + cIdent + cUpper -- Û
312 '\220' -> cAny + cIdent + cUpper -- Ü
313 '\221' -> cAny + cIdent + cUpper -- Ý
314 '\222' -> cAny + cIdent + cUpper -- Þ
315 '\223' -> cAny + cIdent -- ß
316 '\224' -> cAny + cIdent + cLower -- à
317 '\225' -> cAny + cIdent + cLower -- á
318 '\226' -> cAny + cIdent + cLower -- â
319 '\227' -> cAny + cIdent + cLower -- ã
320 '\228' -> cAny + cIdent + cLower -- ä
321 '\229' -> cAny + cIdent + cLower -- å
322 '\230' -> cAny + cIdent + cLower -- æ
323 '\231' -> cAny + cIdent + cLower -- ç
324 '\232' -> cAny + cIdent + cLower -- è
325 '\233' -> cAny + cIdent + cLower -- é
326 '\234' -> cAny + cIdent + cLower -- ê
327 '\235' -> cAny + cIdent + cLower -- ë
328 '\236' -> cAny + cIdent + cLower -- ì
329 '\237' -> cAny + cIdent + cLower -- í
330 '\238' -> cAny + cIdent + cLower -- î
331 '\239' -> cAny + cIdent + cLower -- ï
332 '\240' -> cAny + cIdent + cLower -- ð
333 '\241' -> cAny + cIdent + cLower -- ñ
334 '\242' -> cAny + cIdent + cLower -- ò
335 '\243' -> cAny + cIdent + cLower -- ó
336 '\244' -> cAny + cIdent + cLower -- ô
337 '\245' -> cAny + cIdent + cLower -- õ
338 '\246' -> cAny + cIdent + cLower -- ö
339 '\247' -> cAny + cSymbol -- ÷
340 '\248' -> cAny + cIdent -- ø
341 '\249' -> cAny + cIdent + cLower -- ù
342 '\250' -> cAny + cIdent + cLower -- ú
343 '\251' -> cAny + cIdent + cLower -- û
344 '\252' -> cAny + cIdent + cLower -- ü
345 '\253' -> cAny + cIdent + cLower -- ý
346 '\254' -> cAny + cIdent + cLower -- þ
347 '\255' -> cAny + cIdent + cLower -- ÿ