2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
33 #include "HsVersions.h"
35 import Char ( ord, isSpace )
36 import List ( isSuffixOf )
38 import IdInfo ( InlinePragInfo(..), CprInfo(..) )
39 import Name ( isLowerISO, isUpperISO )
40 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
41 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
42 import Demand ( Demand(..) {- instance Read -} )
43 import UniqFM ( UniqFM, listToUFM, lookupUFM)
44 import BasicTypes ( NewOrData(..) )
45 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile )
47 import Maybes ( MaybeErr(..) )
48 import ErrUtils ( Message )
56 #if __GLASGOW_HASKELL__ >= 303
62 import PrelRead ( readRational__ ) -- Glasgow non-std
65 %************************************************************************
67 \subsection{Data types}
69 %************************************************************************
71 The token data type, fairly un-interesting except from one
72 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
73 strictness, unfolding etc).
75 The Idea/Observation here is that the renamer needs to scan through
76 all of an interface file before it can continue. But only a fraction
77 of the information contained in the file turns out to be useful, so
78 delaying as much as possible of the scanning and parsing of an
79 interface file Makes Sense (Heap profiles of the compiler
80 show a reduction in heap usage by at least a factor of two,
83 Hence, the interface file lexer spots when value declarations are
84 being scanned and return the @ITidinfo@ and @ITtype@ constructors
85 for the type and any other id info for that binding (unfolding, strictness
86 etc). These constructors are applied to the result of lexing these sub-chunks.
88 The lexing of the type and id info is all done lazily, of course, so
89 the scanning (and subsequent parsing) will be done *only* on the ids the
90 renamer finds out that it is interested in. The rest will just be junked.
91 Laziness, you know it makes sense :-)
95 = ITcase -- Haskell keywords
120 | ITinterface -- GHC-extension keywords
128 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
138 | ITonce -- usage annotations
144 | ITunfold InlinePragInfo
145 | ITstrict ([Demand], Bool)
146 | ITcprinfo (CprInfo)
150 | ITdotdot -- reserved symbols
163 | ITbiglam -- GHC-extension symbols
165 | ITocurly -- special symbols
176 | ITvarid FAST_STRING -- identifiers
177 | ITconid FAST_STRING
178 | ITvarsym FAST_STRING
179 | ITconsym FAST_STRING
180 | ITqvarid (FAST_STRING,FAST_STRING)
181 | ITqconid (FAST_STRING,FAST_STRING)
182 | ITqvarsym (FAST_STRING,FAST_STRING)
183 | ITqconsym (FAST_STRING,FAST_STRING)
185 | ITpragma StringBuffer
188 | ITstring FAST_STRING
190 | ITrational Rational
192 | ITunknown String -- Used when the lexer can't make sense of it
193 | ITeof -- end of file token
194 deriving Text -- debugging
197 %************************************************************************
199 \subsection{The lexical analyser}
201 %************************************************************************
204 lexIface :: (IfaceToken -> IfM a) -> IfM a
207 -- if bufferExhausted buf then
210 -- trace ("Lexer: '"++[C# (currentChar# buf)]++"'") $
211 case currentChar# buf of
212 -- whitespace and comments, ignore.
213 ' '# -> lexIface cont (stepOn buf)
214 '\t'# -> lexIface cont (stepOn buf)
215 '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
217 -- Numbers and comments
219 case lookAhead# buf 1# of
220 -- '-'# -> lex_comment cont (stepOnBy# buf 2#)
223 then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
224 else lex_sym cont buf
226 '{'# -> -- look for "{-##" special iface pragma
227 case lookAhead# buf 1# of
228 '-'# -> case lookAhead# buf 2# of
229 '#'# -> case lookAhead# buf 3# of
232 = doDiscard False (stepOnBy# buf 4#) in
233 cont (ITpragma lexeme) buf'
234 _ -> lex_nested_comment (lexIface cont) buf
235 _ -> cont ITocurly (stepOn buf)
236 -- lex_nested_comment (lexIface cont) buf
237 _ -> cont ITocurly (stepOn buf)
239 -- special symbols ----------------------------------------------------
241 case prefixMatch (stepOn buf) "..)" of
242 Just buf' -> cont ITdotdot (stepOverLexeme buf')
244 case lookAhead# buf 1# of
245 '#'# -> cont IToubxparen (stepOnBy# buf 2#)
246 _ -> cont IToparen (stepOn buf)
247 ')'# -> cont ITcparen (stepOn buf)
248 '}'# -> cont ITccurly (stepOn buf)
249 '#'# -> case lookAhead# buf 1# of
250 ')'# -> cont ITcubxparen (stepOnBy# buf 2#)
251 _ -> lex_sym cont (incLexeme buf)
252 '['# -> cont ITobrack (stepOn buf)
253 ']'# -> cont ITcbrack (stepOn buf)
254 ','# -> cont ITcomma (stepOn buf)
255 ';'# -> cont ITsemi (stepOn buf)
257 -- strings/characters -------------------------------------------------
258 '\"'#{-"-} -> case untilEndOfString# (stepOn buf) of
260 -- the string literal does *not* include the dquotes
261 case lexemeToFastString buf' of
262 v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
265 -- untilEndOfChar# extends the current lexeme until
266 -- it hits a non-escaped single quote. The lexeme of the
267 -- StringBuffer returned does *not* include the closing quote,
268 -- hence we augment the lexeme and make sure to add the
269 -- starting quote, before `read'ing the string.
271 case untilEndOfChar# (stepOn buf) of
272 buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
273 [ (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
275 -- strictness and cpr pragmas and __scc treated specially.
277 case lookAhead# buf 1# of
278 '_'# -> case lookAhead# buf 2# of
280 lex_demand cont (stepOnUntil (not . isSpace)
281 (stepOnBy# buf 3#)) -- past __S
283 lex_cpr cont (stepOnUntil (not . isSpace)
284 (stepOnBy# buf 3#)) -- past __M
286 case prefixMatch (stepOnBy# buf 3#) "cc" of
287 Just buf' -> lex_scc cont (stepOverLexeme buf')
288 Nothing -> lex_id cont buf
292 -- ``thingy'' form for casm
294 case lookAhead# buf 1# of
295 '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `s and go.
296 _ -> lex_sym cont (incLexeme buf) -- add ` to lexeme and assume
297 -- scanning an id of some sort.
300 if bufferExhausted (stepOn buf) then
303 trace "lexIface: misplaced NUL?" $
304 cont (ITunknown "\NUL") (stepOn buf)
306 c | is_digit c -> lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
307 | is_symbol c -> lex_sym cont buf
308 | is_upper c -> lex_con cont buf
309 | is_ident c -> lex_id cont buf
312 lex_comment cont buf =
313 -- _trace ("comment: "++[C# (currentChar# buf)]) $
314 case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
316 -------------------------------------------------------------------------------
318 lex_nested_comment cont buf =
319 case currentChar# buf of
320 '-'# -> case lookAhead# buf 1# of
321 '}'# -> cont (stepOnBy# buf 2#)
322 _ -> lex_nested_comment cont (stepOn buf)
324 '{'# -> case lookAhead# buf 1# of
325 '-'# -> lex_nested_comment
326 (lex_nested_comment cont)
328 _ -> lex_nested_comment cont (stepOn buf)
330 _ -> lex_nested_comment cont (stepOn buf)
332 -------------------------------------------------------------------------------
334 lex_demand cont buf =
335 case read_em [] buf of { (ls,buf') ->
336 case currentChar# buf' of
337 'B'# -> cont (ITstrict (ls, True )) (stepOverLexeme (stepOn buf'))
338 _ -> cont (ITstrict (ls, False)) (stepOverLexeme buf')
341 -- code snatched from Demand.lhs
343 case currentChar# buf of
344 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
345 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
346 'S'# -> read_em (WwStrict : acc) (stepOn buf)
347 'P'# -> read_em (WwPrim : acc) (stepOn buf)
348 'E'# -> read_em (WwEnum : acc) (stepOn buf)
349 ')'# -> (reverse acc, stepOn buf)
350 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
351 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
352 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
353 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
354 _ -> (reverse acc, buf)
356 do_unpack new_or_data wrapper_unpacks acc buf
357 = case read_em [] buf of
358 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
361 case read_em [] buf of { (cpr_inf,buf') ->
362 ASSERT ( null (tail cpr_inf) )
363 cont (ITcprinfo $ head cpr_inf) (stepOverLexeme buf')
366 -- code snatched from lex_demand above
368 case currentChar# buf of
369 '-'# -> read_em (NoCPRInfo : acc) (stepOn buf)
370 '('# -> do_unpack acc (stepOn buf)
371 ')'# -> (reverse acc, stepOn buf)
372 _ -> (reverse acc, buf)
375 = case read_em [] buf of
376 (stuff, rest) -> read_em ((CPRInfo stuff) : acc) rest
380 case currentChar# buf of
381 'C'# -> cont ITsccAllCafs (stepOverLexeme (stepOn buf))
382 other -> cont ITscc buf
385 lex_num :: (IfaceToken -> IfM a) -> (Int -> Int) -> Int# -> IfM a
386 lex_num cont minus acc# buf =
387 --trace ("lex_num: "++[C# (currentChar# buf)]) $
388 case scanNumLit (I# acc#) buf of
390 case currentChar# buf' of
392 -- this case is not optimised at all, as the
393 -- presence of floating point numbers in interface
394 -- files is not that common. (ToDo)
395 case expandWhile# is_digit (incLexeme buf') of
396 buf2 -> -- points to first non digit char
397 let l = case currentChar# buf2 of
398 'e'# -> let buf3 = incLexeme buf2 in
399 case currentChar# buf3 of
400 '-'# -> expandWhile# is_digit (incLexeme buf3)
401 _ -> expandWhile# is_digit buf3
403 in let v = readRational__ (lexemeToString l) in
404 cont (ITrational v) (stepOverLexeme l)
406 _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
409 lex_cstring cont buf =
410 case expandUntilMatch buf "\'\'" of
411 buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
412 (stepOverLexeme buf')
414 ------------------------------------------------------------------------------
417 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
419 {-# INLINE is_ctype #-}
420 #if __GLASGOW_HASKELL__ >= 303
421 is_ctype :: Word8 -> Char# -> Bool
422 is_ctype mask = \c ->
423 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
425 is_ctype :: Int -> Char# -> Bool
426 is_ctype (I# mask) = \c ->
427 let (A# ctype) = ``char_types'' :: Addr
428 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
430 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
433 is_ident = is_ctype 1
434 is_symbol = is_ctype 2
436 is_space = is_ctype 8
437 is_upper = is_ctype 16
438 is_digit = is_ctype 32
440 -----------------------------------------------------------------------------
441 -- identifiers, symbols etc.
444 case expandWhile# is_ident buf of { buf1 ->
445 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
446 let new_buf = stepOverLexeme buf'
447 lexeme = lexemeToFastString buf'
449 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
450 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
451 cont kwd_token new_buf;
453 case lookupUFM ifaceKeywordsFM lexeme of {
454 Just kwd_token -> --trace ("ifacekeywd: "++_UNPK_(lexeme)) $
455 cont kwd_token new_buf;
456 Nothing -> --trace ("id: "++_UNPK_(lexeme)) $
457 cont (mk_var_token lexeme) new_buf
461 case expandWhile# is_symbol buf of
463 | is_comment lexeme -> lex_comment cont new_buf
465 case lookupUFM haskellKeySymsFM lexeme of {
466 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
467 cont kwd_token new_buf ;
468 Nothing -> --trace ("sym: "++unpackFS lexeme) $
469 cont (mk_var_token lexeme) new_buf
471 where lexeme = lexemeToFastString buf'
472 new_buf = stepOverLexeme buf'
476 | otherwise = trundle 0
480 trundle n | n == len = True
481 | otherwise = indexFS fs n == '-' && trundle (n+1)
484 case expandWhile# is_ident buf of { buf1 ->
485 case expandWhile# (eqChar# '#'#) buf1 of { buf' ->
486 case currentChar# buf' of
487 '.'# -> lex_qid cont lexeme (stepOn new_buf) just_a_conid
491 just_a_conid = --trace ("con: "++unpackFS lexeme) $
492 cont (ITconid lexeme) new_buf
493 lexeme = lexemeToFastString buf'
494 new_buf = stepOverLexeme buf'
497 lex_qid cont mod buf just_a_conid =
498 case currentChar# buf of
499 '['# -> -- Special case for []
500 case lookAhead# buf 1# of
501 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (stepOnBy# buf 2#)
504 '('# -> -- Special case for (,,,)
505 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
506 case lookAhead# buf 1# of
507 '#'# -> case lookAhead# buf 2# of
508 ','# -> lex_ubx_tuple cont mod (stepOnBy# buf 3#)
511 ')'# -> cont (ITqconid (mod,SLIT("()"))) (stepOnBy# buf 2#)
512 ','# -> lex_tuple cont mod (stepOnBy# buf 2#) just_a_conid
515 '-'# -> case lookAhead# buf 1# of
516 '>'# -> cont (ITqconid (mod,SLIT("->"))) (stepOnBy# buf 2#)
517 _ -> lex_id3 cont mod buf just_a_conid
518 _ -> lex_id3 cont mod buf just_a_conid
520 lex_id3 cont mod buf just_a_conid
522 case expandWhile# is_symbol buf of { buf' ->
524 lexeme = lexemeToFastString buf'
525 new_buf = stepOverLexeme buf'
527 case lookupUFM haskellKeySymsFM lexeme of {
528 Just kwd_token -> just_a_conid; -- avoid M.:: etc.
529 Nothing -> cont (mk_qvar_token mod lexeme) new_buf
533 case expandWhile# is_ident buf of { buf1 ->
537 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
539 lexeme = lexemeToFastString buf'
540 new_buf = stepOverLexeme buf'
542 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
543 Just kwd_token -> just_a_conid; -- avoid M.where etc.
545 case lookupUFM ifaceKeywordsFM lexeme of { -- only for iface files
546 Just kwd_token -> just_a_conid;
547 Nothing -> cont (mk_qvar_token mod lexeme) new_buf
549 where c = currentChar# buf
552 | is_upper f = ITconid pk_str
553 -- _[A-Z] is treated as a constructor in interface files.
554 | f `eqChar#` '_'# && not (_NULL_ tl)
555 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
556 | is_ident f = ITvarid pk_str
557 | f `eqChar#` ':'# = ITconsym pk_str
558 | otherwise = ITvarsym pk_str
560 (C# f) = _HEAD_ pk_str
563 mk_qvar_token m token =
564 case mk_var_token token of
565 ITconid n -> ITqconid (m,n)
566 ITvarid n -> ITqvarid (m,n)
567 ITconsym n -> ITqconsym (m,n)
568 ITvarsym n -> ITqvarsym (m,n)
569 _ -> ITunknown (show token)
572 ----------------------------------------------------------------------------
573 Horrible stuff for dealing with M.(,,,)
576 lex_tuple cont mod buf back_off =
580 case currentChar# buf of
581 ','# -> go (n+1) (stepOn buf)
582 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n))) (stepOn buf)
585 lex_ubx_tuple cont mod buf back_off =
589 case currentChar# buf of
590 ','# -> go (n+1) (stepOn buf)
591 '#'# -> case lookAhead# buf 1# of
592 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n)))
598 -----------------------------------------------------------------------------
602 ifaceKeywordsFM :: UniqFM IfaceToken
603 ifaceKeywordsFM = listToUFM $
604 map (\ (x,y) -> (_PK_ x,y))
605 [ ("__interface", ITinterface),
606 ("__export", ITexport),
607 ("__depends", ITdepends),
608 ("__forall", ITforall),
609 ("__letrec", ITletrec),
610 ("__coerce", ITcoerce),
611 ("__inline_me", ITinlineMe),
612 ("__inline_call", ITinlineCall),
613 ("__DEFAULT", ITdefaultbranch),
615 ("__integer", ITinteger_lit),
616 ("__float", ITfloat_lit),
617 ("__rational", ITrational_lit),
618 ("__addr", ITaddr_lit),
619 ("__litlit", ITlit_lit),
620 ("__string", ITstring_lit),
626 ("__P", ITspecialise),
628 ("__u", ITunfold NoInlinePragInfo),
630 ("__ccall", ITccall (False, False, False)),
631 ("__ccall_GC", ITccall (False, False, True)),
632 ("__dyn_ccall", ITccall (True, False, False)),
633 ("__dyn_ccall_GC", ITccall (True, False, True)),
634 ("__casm", ITccall (False, True, False)),
635 ("__dyn_casm", ITccall (True, True, False)),
636 ("__casm_GC", ITccall (False, True, True)),
637 ("__dyn_casm_GC", ITccall (True, True, True)),
642 haskellKeywordsFM = listToUFM $
643 map (\ (x,y) -> (_PK_ x,y))
645 ( "class", ITclass ),
647 ( "default", ITdefault ),
648 ( "deriving", ITderiving ),
652 ( "import", ITimport ),
654 ( "infix", ITinfix ),
655 ( "infixl", ITinfixl ),
656 ( "infixr", ITinfixr ),
657 ( "instance", ITinstance ),
659 ( "module", ITmodule ),
660 ( "newtype", ITnewtype ),
666 -- These three aren't Haskell keywords at all
667 -- and 'as' is often used as a variable name
669 -- ( "qualified", ITqualified ),
670 -- ( "hiding", IThiding )
674 haskellKeySymsFM = listToUFM $
675 map (\ (x,y) -> (_PK_ x,y))
691 -----------------------------------------------------------------------------
692 doDiscard rips along really fast, looking for a '#-}',
693 indicating the end of the pragma we're skipping
696 doDiscard inStr buf =
697 case currentChar# buf of
699 case lookAhead# buf 1# of { '#'# ->
700 case lookAhead# buf 2# of { '-'# ->
701 case lookAhead# buf 3# of { '}'# ->
702 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
703 _ -> doDiscard inStr (incLexeme buf) };
704 _ -> doDiscard inStr (incLexeme buf) };
705 _ -> doDiscard inStr (incLexeme buf) }
708 odd_slashes buf flg i# =
709 case lookAhead# buf i# of
710 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
713 case lookAhead# buf (negateInt# 1#) of --backwards, actually
714 '\\'# -> -- escaping something..
715 if odd_slashes buf True (negateInt# 2#) then
716 -- odd number of slashes, " is escaped.
717 doDiscard inStr (incLexeme buf)
719 -- even number of slashes, \ is escaped.
720 doDiscard (not inStr) (incLexeme buf)
721 _ -> case inStr of -- forced to avoid build-up
722 True -> doDiscard False (incLexeme buf)
723 False -> doDiscard True (incLexeme buf)
724 _ -> doDiscard inStr (incLexeme buf)
728 -----------------------------------------------------------------------------
731 type IfM a = StringBuffer -- Input string
733 -> MaybeErr a {-error-}Message
735 returnIf :: a -> IfM a
736 returnIf a s l = Succeeded a
738 thenIf :: IfM a -> (a -> IfM b) -> IfM b
739 m `thenIf` k = \s l ->
741 Succeeded a -> k a s l
742 Failed err -> Failed err
744 getSrcLocIf :: IfM SrcLoc
745 getSrcLocIf s l = Succeeded l
748 happyError s l = Failed (ifaceParseErr s l)
752 Note that if the name of the file we're processing ends
753 with `hi-boot', we accept it on faith as having the right
754 version. This is done so that .hi-boot files that comes
755 with hsc don't have to be updated before every release,
756 *and* it allows us to share .hi-boot files with versions
757 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
759 If the version number is 0, the checking is also turned off.
760 (needed to deal with GHC.hi only!)
762 Once we can assume we're compiling with a version of ghc that
763 supports interface file checking, we can drop the special
766 checkVersion :: Maybe Integer -> IfM ()
767 checkVersion mb@(Just v) s l
768 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = Succeeded ()
769 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
770 checkVersion mb@Nothing s l
771 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
772 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
774 -----------------------------------------------------------------
776 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
778 = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
779 ptext SLIT("current input ="), text first_bit]
781 first_bit = lexemeToString (stepOnBy# s 100#)
783 ifaceVersionErr hi_vers l toks
784 = hsep [ppr l, ptext SLIT("Interface file version error;"),
785 ptext SLIT("Expected"), int opt_HiVersion,
786 ptext SLIT("found "), pp_version]
790 Nothing -> ptext SLIT("pre ghc-3.02 version")
791 Just v -> ptext SLIT("version") <+> integer v