2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
4 \section[Lexical analysis]{Lexical analysis}
7 #include "HsVersions.h"
11 isLexCon, isLexVar, isLexId, isLexSym,
12 isLexConId, isLexConSym, isLexVarId, isLexVarSym,
16 IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError
21 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper))
23 import CmdLineOpts ( opt_IgnoreIfacePragmas )
24 import Demand ( Demand {- instance Read -} )
25 import FiniteMap ( FiniteMap, listToFM, lookupFM )
26 import Maybes ( Maybe(..), MaybeErr(..) )
28 import CharSeq ( CSeq )
29 import ErrUtils ( Error(..) )
30 import Outputable ( Outputable(..) )
31 import PprStyle ( PprStyle(..) )
32 import Util ( nOfThem, panic )
36 %************************************************************************
38 \subsection{Lexical categories}
40 %************************************************************************
42 These functions test strings to see if they fit the lexical categories
43 defined in the Haskell report. Normally applied as in e.g. @isCon
47 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
48 isLexVarId, isLexVarSym :: FAST_STRING -> Bool
50 isLexCon cs = isLexConId cs || isLexConSym cs
51 isLexVar cs = isLexVarId cs || isLexVarSym cs
53 isLexId cs = isLexConId cs || isLexVarId cs
54 isLexSym cs = isLexConSym cs || isLexVarSym cs
60 | cs == SLIT("[]") = True
61 | c == '(' = True -- (), (,), (,,), ...
62 | otherwise = isUpper c || isUpperISO c
68 | otherwise = isLower c || isLowerISO c
74 | otherwise = c == ':'
81 | otherwise = isSymbolASCII c
87 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
88 isSymbolISO c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
89 isUpperISO c = 0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
90 isLowerISO c = 0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
94 %************************************************************************
96 \subsection{Tuple strings -- ugh!}
98 %************************************************************************
101 mkTupNameStr 0 = SLIT("()")
102 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
103 mkTupNameStr 2 = _PK_ "(,)" -- not strictly necessary
104 mkTupNameStr 3 = _PK_ "(,,)" -- ditto
105 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
106 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
111 %************************************************************************
113 \subsection{Data types}
115 %************************************************************************
119 = ITinterface -- keywords
139 | ITbang -- magic symbols
156 | ITvarid FAST_STRING
157 | ITconid FAST_STRING
158 | ITvarsym FAST_STRING
159 | ITconsym FAST_STRING
160 | ITqvarid (FAST_STRING,FAST_STRING)
161 | ITqconid (FAST_STRING,FAST_STRING)
162 | ITqvarsym (FAST_STRING,FAST_STRING)
163 | ITqconsym (FAST_STRING,FAST_STRING)
165 -- Stuff for reading unfoldings
166 | ITarity | ITstrict | ITunfold
167 | ITdemand [Demand] | ITbottom
168 | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
169 | ITcoerce_in | ITcoerce_out | ITatsign
170 | ITccall (Bool,Bool) -- (is_casm, may_gc)
172 | ITchar Char | ITstring FAST_STRING
173 | ITinteger Integer | ITdouble Double
174 | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
175 deriving Text -- debugging
178 %************************************************************************
180 \subsection{The lexical analyser}
182 %************************************************************************
185 lexIface :: String -> [IfaceToken]
192 -- whitespace and comments
193 ' ' : cs -> lexIface cs
194 '\t' : cs -> lexIface cs
195 '\n' : cs -> lexIface cs
196 '-' : '-' : cs -> lex_comment cs
198 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
199 -- '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
201 '(' : '.' : '.' : ')' : cs -> ITdotdot : lexIface cs
202 '{' : cs -> ITocurly : lexIface cs
203 '}' : cs -> ITccurly : lexIface cs
204 '(' : ',' : cs -> lex_tuple Nothing cs
205 '(' : ')' : cs -> ITconid SLIT("()") : lexIface cs
206 '(' : cs -> IToparen : lexIface cs
207 ')' : cs -> ITcparen : lexIface cs
208 '[' : ']' : cs -> ITconid SLIT("[]") : lexIface cs
209 '[' : cs -> ITobrack : lexIface cs
210 ']' : cs -> ITcbrack : lexIface cs
211 ',' : cs -> ITcomma : lexIface cs
212 ':' : ':' : cs -> ITdcolon : lexIface cs
213 ';' : cs -> ITsemi : lexIface cs
214 '\"' : cs -> case reads input of
215 [(str, rest)] -> ITstring (_PK_ (str::String)) : lexIface rest
216 '\'' : cs -> case reads input of
217 [(ch, rest)] -> ITchar ch : lexIface rest
219 -- ``thingy'' form for casm
220 '`' : '`' : cs -> lex_cstring "" cs
223 '_' : 'S' : '_' : cs -> ITstrict : lex_demand cs
224 '_' : cs -> lex_keyword cs
227 '-' : c : cs | isDigit c -> lex_num "-" (c:cs)
228 c : cs | isDigit c -> lex_num "" (c:cs)
230 other -> lex_id input
233 = case (span ((/=) '\n') str) of { (junk, rest) ->
237 lex_demand (c:cs) | isSpace c = lex_demand cs
238 | otherwise = case readList (c:cs) of
239 ((demand,rest) : _) -> ITdemand demand : lexIface rest
243 = case (span isDigit str) of { (num, rest) ->
245 '.' : str2 -> case (span isDigit str2) of { (num2,rest2) ->
246 ITdouble (read (minus ++ num ++ ('.':num2))) : lexIface rest2
249 other -> ITinteger (read (minus ++ num)) : lexIface rest
254 = case (span is_kwd_mod_char str) of { (kw, rest) ->
255 case (lookupFM ifaceKeywordsFM kw) of
256 Nothing -> panic ("lex_keyword:"++str)
258 Just xx | startDiscard xx &&
259 opt_IgnoreIfacePragmas -> lexIface (doDiscard rest)
260 | otherwise -> xx : lexIface rest
263 is_kwd_mod_char c = isAlphanum c || c `elem` "_@/\\"
266 lex_cstring so_far ('\'' : '\'' : cs) = ITstring (_PK_ (reverse (so_far::String))) : lexIface cs
267 lex_cstring so_far (c : cs) = lex_cstring (c:so_far) cs
271 lex_tuple module_dot orig_cs = go 2 orig_cs
273 go n (',':cs) = go (n+1) cs
274 go n (')':cs) = end_lex_id module_dot (ITconid (mkTupNameStr n)) cs
275 go n other = panic ("lex_tuple" ++ orig_cs)
277 -- Similarly ' itself is ok inside an identifier, but not at the start
278 is_id_char c = isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
282 go xs (f :cs) | is_kwd_mod_char f = go (f : xs) cs
283 go xs ('.':cs) | not (null xs) = lex_id2 (Just (_PK_ (reverse xs))) [] cs
284 go xs cs = lex_id2 Nothing xs cs
286 -- Dealt with the Module.part
287 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
288 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
289 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
290 lex_id2 module_dot [] (':' : cs) = lex_id3 module_dot [':'] cs
291 lex_id2 module_dot xs cs = lex_id3 module_dot xs cs
293 -- Dealt with [], (), : special cases
294 lex_id3 module_dot xs (f:cs) | is_id_char f = lex_id3 module_dot (f : xs) cs
296 lex_id3 Nothing xs rest = case lookupFM haskellKeywordsFM rxs of
297 Just kwd_token -> kwd_token : lexIface rest
298 other -> (mk_var_token rxs) : lexIface rest
302 lex_id3 (Just m) xs rest = end_lex_id (Just m) (mk_var_token (reverse xs)) rest
304 mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
305 | f == ':' = ITconsym n
306 | isAlpha f = ITvarid n
307 | otherwise = ITvarsym n
311 end_lex_id (Just m) (ITconid n) cs = ITqconid (m,n) : lexIface cs
312 end_lex_id (Just m) (ITvarid n) cs = ITqvarid (m,n) : lexIface cs
313 end_lex_id (Just m) (ITconsym n) cs = ITqconsym (m,n): lexIface cs
314 end_lex_id (Just m) (ITvarsym n) cs = ITqvarsym (m,n): lexIface cs
315 end_lex_id (Just m) ITbang cs = ITqvarsym (m,SLIT("!")) : lexIface cs
316 end_lex_id (Just m) token cs = panic ("end_lex_id:" ++ show token)
317 end_lex_id Nothing token cs = token : lexIface cs
320 ifaceKeywordsFM :: FiniteMap String IfaceToken
321 ifaceKeywordsFM = listToFM [
324 ,("interface_", ITinterface)
325 ,("usages_", ITusages)
326 ,("versions_", ITversions)
327 ,("exports_", ITexports)
328 ,("instance_modules_", ITinstance_modules)
329 ,("instances_", ITinstances)
330 ,("fixities_", ITfixities)
331 ,("declarations_", ITdeclarations)
332 ,("pragmas_", ITpragmas)
333 ,("forall_", ITforall)
336 ,("coerce_in_", ITcoerce_in)
337 ,("coerce_out_", ITcoerce_out)
339 ,("integer_", ITinteger_lit)
340 ,("rational_", ITrational_lit)
341 ,("addr_", ITaddr_lit)
342 ,("float_", ITfloat_lit)
343 ,("string_", ITstring_lit)
344 ,("litlit_", ITlit_lit)
345 ,("ccall_", ITccall (False, False))
346 ,("ccall_GC_", ITccall (False, True))
347 ,("casm_", ITccall (True, False))
348 ,("casm_GC_", ITccall (True, True))
351 haskellKeywordsFM = listToFM [
354 ,("newtype", ITnewtype)
357 ,("instance", ITinstance)
358 ,("infixl", ITinfixl)
359 ,("infixr", ITinfixr)
362 ,("case#", ITprim_case)
366 ,("letrec", ITletrec)
367 ,("deriving", ITderiving)
377 startDiscard ITarity = True
378 startDiscard ITunfold = True
379 startDiscard ITstrict = True
380 startDiscard other = False
382 -- doDiscard rips along really fast looking for a double semicolon,
383 -- indicating the end of the pragma we're skipping
384 doDiscard rest@(';' : ';' : _) = rest
385 doDiscard ( _ : rest) = doDiscard rest
390 %************************************************************************
392 \subsection{Other utility functions
394 %************************************************************************
397 type IfM a = MaybeErr a Error
399 returnIf :: a -> IfM a
400 thenIf :: IfM a -> (a -> IfM b) -> IfM b
401 happyError :: Int -> [IfaceToken] -> IfM a
403 returnIf a = Succeeded a
405 thenIf (Succeeded a) k = k a
406 thenIf (Failed err) _ = Failed err
408 happyError ln toks = Failed (ifaceParseErr ln toks)
410 -----------------------------------------------------------------
412 ifaceParseErr ln toks sty
413 = ppCat [ppPStr SLIT("Interface-file parse error: line"), ppInt ln, ppStr "toks=", ppStr (show (take 10 toks))]