1 Character classification
5 ( is_ident -- Char# -> Bool
6 , is_symbol -- Char# -> Bool
7 , is_any -- Char# -> Bool
8 , is_space -- Char# -> Bool
9 , is_lower -- Char# -> Bool
10 , is_upper -- Char# -> Bool
11 , is_digit -- Char# -> Bool
12 , is_alphanum -- Char# -> Bool
14 , is_decdigit, is_hexdigit, is_octdigit
15 , hexDigit, octDecDigit
18 #include "HsVersions.h"
20 import Data.Int ( Int32 )
21 import Data.Bits ( Bits((.&.)) )
22 import Data.Char ( ord, chr )
29 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
39 The predicates below look costly, but aren't, GHC+GCC do a great job
40 at the big case below.
43 {-# INLINE is_ctype #-}
44 is_ctype :: Int -> Char -> Bool
45 is_ctype mask c = (fromIntegral (charType c) .&. fromIntegral mask) /= (0::Int32)
47 is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
48 is_alphanum :: Char -> Bool
49 is_ident = is_ctype cIdent
50 is_symbol = is_ctype cSymbol
51 is_any = is_ctype cAny
52 is_space = is_ctype cSpace
53 is_lower = is_ctype cLower
54 is_upper = is_ctype cUpper
55 is_digit = is_ctype cDigit
56 is_alphanum = is_ctype (cLower+cUpper+cDigit)
62 hexDigit :: Char -> Int
63 hexDigit c | is_decdigit c = ord c - ord '0'
64 | otherwise = ord (to_lower c) - ord 'a' + 10
66 octDecDigit :: Char -> Int
67 octDecDigit c = ord c - ord '0'
69 is_decdigit :: Char -> Bool
71 = c >= '0' && c <= '9'
73 is_hexdigit :: Char -> Bool
76 || (c >= 'a' && c <= 'f')
77 || (c >= 'A' && c <= 'F')
79 is_octdigit :: Char -> Bool
80 is_octdigit c = c >= '0' && c <= '7'
82 to_lower :: Char -> Char
84 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
88 We really mean .|. instead of + below, but GHC currently doesn't do
89 any constant folding with bitops. *sigh*
92 charType :: Char -> Int
93 charType c = case c of
103 '\9' -> cSpace -- \t (not allowed in strings, so !cAny)
104 '\10' -> cSpace -- \n (ditto)
105 '\11' -> cSpace -- \v (ditto)
106 '\12' -> cSpace -- \f (ditto)
107 '\13' -> cSpace -- ^M (ditto)
126 '\32' -> cAny + cSpace --
127 '\33' -> cAny + cSymbol -- !
129 '\35' -> cAny + cSymbol -- #
130 '\36' -> cAny + cSymbol -- $
131 '\37' -> cAny + cSymbol -- %
132 '\38' -> cAny + cSymbol -- &
133 '\39' -> cAny + cIdent -- '
136 '\42' -> cAny + cSymbol -- *
137 '\43' -> cAny + cSymbol -- +
139 '\45' -> cAny + cSymbol -- -
140 '\46' -> cAny + cSymbol -- .
141 '\47' -> cAny + cSymbol -- /
142 '\48' -> cAny + cIdent + cDigit -- 0
143 '\49' -> cAny + cIdent + cDigit -- 1
144 '\50' -> cAny + cIdent + cDigit -- 2
145 '\51' -> cAny + cIdent + cDigit -- 3
146 '\52' -> cAny + cIdent + cDigit -- 4
147 '\53' -> cAny + cIdent + cDigit -- 5
148 '\54' -> cAny + cIdent + cDigit -- 6
149 '\55' -> cAny + cIdent + cDigit -- 7
150 '\56' -> cAny + cIdent + cDigit -- 8
151 '\57' -> cAny + cIdent + cDigit -- 9
152 '\58' -> cAny + cSymbol -- :
154 '\60' -> cAny + cSymbol -- <
155 '\61' -> cAny + cSymbol -- =
156 '\62' -> cAny + cSymbol -- >
157 '\63' -> cAny + cSymbol -- ?
158 '\64' -> cAny + cSymbol -- @
159 '\65' -> cAny + cIdent + cUpper -- A
160 '\66' -> cAny + cIdent + cUpper -- B
161 '\67' -> cAny + cIdent + cUpper -- C
162 '\68' -> cAny + cIdent + cUpper -- D
163 '\69' -> cAny + cIdent + cUpper -- E
164 '\70' -> cAny + cIdent + cUpper -- F
165 '\71' -> cAny + cIdent + cUpper -- G
166 '\72' -> cAny + cIdent + cUpper -- H
167 '\73' -> cAny + cIdent + cUpper -- I
168 '\74' -> cAny + cIdent + cUpper -- J
169 '\75' -> cAny + cIdent + cUpper -- K
170 '\76' -> cAny + cIdent + cUpper -- L
171 '\77' -> cAny + cIdent + cUpper -- M
172 '\78' -> cAny + cIdent + cUpper -- N
173 '\79' -> cAny + cIdent + cUpper -- O
174 '\80' -> cAny + cIdent + cUpper -- P
175 '\81' -> cAny + cIdent + cUpper -- Q
176 '\82' -> cAny + cIdent + cUpper -- R
177 '\83' -> cAny + cIdent + cUpper -- S
178 '\84' -> cAny + cIdent + cUpper -- T
179 '\85' -> cAny + cIdent + cUpper -- U
180 '\86' -> cAny + cIdent + cUpper -- V
181 '\87' -> cAny + cIdent + cUpper -- W
182 '\88' -> cAny + cIdent + cUpper -- X
183 '\89' -> cAny + cIdent + cUpper -- Y
184 '\90' -> cAny + cIdent + cUpper -- Z
186 '\92' -> cAny + cSymbol -- backslash
188 '\94' -> cAny + cSymbol -- ^
189 '\95' -> cAny + cIdent + cLower -- _
191 '\97' -> cAny + cIdent + cLower -- a
192 '\98' -> cAny + cIdent + cLower -- b
193 '\99' -> cAny + cIdent + cLower -- c
194 '\100' -> cAny + cIdent + cLower -- d
195 '\101' -> cAny + cIdent + cLower -- e
196 '\102' -> cAny + cIdent + cLower -- f
197 '\103' -> cAny + cIdent + cLower -- g
198 '\104' -> cAny + cIdent + cLower -- h
199 '\105' -> cAny + cIdent + cLower -- i
200 '\106' -> cAny + cIdent + cLower -- j
201 '\107' -> cAny + cIdent + cLower -- k
202 '\108' -> cAny + cIdent + cLower -- l
203 '\109' -> cAny + cIdent + cLower -- m
204 '\110' -> cAny + cIdent + cLower -- n
205 '\111' -> cAny + cIdent + cLower -- o
206 '\112' -> cAny + cIdent + cLower -- p
207 '\113' -> cAny + cIdent + cLower -- q
208 '\114' -> cAny + cIdent + cLower -- r
209 '\115' -> cAny + cIdent + cLower -- s
210 '\116' -> cAny + cIdent + cLower -- t
211 '\117' -> cAny + cIdent + cLower -- u
212 '\118' -> cAny + cIdent + cLower -- v
213 '\119' -> cAny + cIdent + cLower -- w
214 '\120' -> cAny + cIdent + cLower -- x
215 '\121' -> cAny + cIdent + cLower -- y
216 '\122' -> cAny + cIdent + cLower -- z
218 '\124' -> cAny + cSymbol -- |
220 '\126' -> cAny + cSymbol -- ~
222 _ -> panic ("charType: " ++ show c)