2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
33 #include "HsVersions.h"
35 import Char ( ord, isSpace )
36 import List ( isSuffixOf )
38 import IdInfo ( InlinePragInfo(..) )
39 import Name ( isLowerISO, isUpperISO )
40 import Module ( IfaceFlavour, hiFile, hiBootFile )
41 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
42 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
43 import Demand ( Demand(..) {- instance Read -} )
44 import UniqFM ( UniqFM, listToUFM, lookupUFM)
45 import BasicTypes ( NewOrData(..) )
46 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile )
48 import Maybes ( MaybeErr(..) )
49 import ErrUtils ( Message )
57 #if __GLASGOW_HASKELL__ >= 303
63 import PrelRead ( readRational__ ) -- Glasgow non-std
66 %************************************************************************
68 \subsection{Data types}
70 %************************************************************************
72 The token data type, fairly un-interesting except from one
73 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
74 strictness, unfolding etc).
76 The Idea/Observation here is that the renamer needs to scan through
77 all of an interface file before it can continue. But only a fraction
78 of the information contained in the file turns out to be useful, so
79 delaying as much as possible of the scanning and parsing of an
80 interface file Makes Sense (Heap profiles of the compiler
81 show a reduction in heap usage by at least a factor of two,
84 Hence, the interface file lexer spots when value declarations are
85 being scanned and return the @ITidinfo@ and @ITtype@ constructors
86 for the type and any other id info for that binding (unfolding, strictness
87 etc). These constructors are applied to the result of lexing these sub-chunks.
89 The lexing of the type and id info is all done lazily, of course, so
90 the scanning (and subsequent parsing) will be done *only* on the ids the
91 renamer finds out that it is interested in. The rest will just be junked.
92 Laziness, you know it makes sense :-)
96 = ITcase -- Haskell keywords
121 | ITinterface -- GHC-extension keywords
128 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
141 | ITunfold InlinePragInfo
142 | ITstrict ([Demand], Bool)
146 | ITdotdot -- reserved symbols
159 | ITbiglam -- GHC-extension symbols
161 | ITocurly -- special symbols
172 | ITvarid FAST_STRING -- identifiers
173 | ITconid FAST_STRING
174 | ITvarsym FAST_STRING
175 | ITconsym FAST_STRING
176 | ITqvarid (FAST_STRING,FAST_STRING,IfaceFlavour)
177 | ITqconid (FAST_STRING,FAST_STRING,IfaceFlavour)
178 | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
179 | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
181 | ITpragma StringBuffer
184 | ITstring FAST_STRING
186 | ITrational Rational
188 | ITunknown String -- Used when the lexer can't make sense of it
189 | ITeof -- end of file token
190 deriving Text -- debugging
193 %************************************************************************
195 \subsection{The lexical analyser}
197 %************************************************************************
200 lexIface :: (IfaceToken -> IfM a) -> IfM a
203 -- if bufferExhausted buf then
206 -- trace ("Lexer: '"++[C# (currentChar# buf)]++"'") $
207 case currentChar# buf of
208 -- whitespace and comments, ignore.
209 ' '# -> lexIface cont (stepOn buf)
210 '\t'# -> lexIface cont (stepOn buf)
211 '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
213 -- Numbers and comments
215 case lookAhead# buf 1# of
216 -- '-'# -> lex_comment cont (stepOnBy# buf 2#)
219 then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
220 else lex_sym cont buf
222 '{'# -> -- look for "{-##" special iface pragma
223 case lookAhead# buf 1# of
224 '-'# -> case lookAhead# buf 2# of
225 '#'# -> case lookAhead# buf 3# of
228 = doDiscard False (stepOnBy# buf 4#) in
229 cont (ITpragma lexeme) buf'
230 _ -> lex_nested_comment (lexIface cont) buf
231 _ -> cont ITocurly (stepOn buf)
232 -- lex_nested_comment (lexIface cont) buf
233 _ -> cont ITocurly (stepOn buf)
235 -- special symbols ----------------------------------------------------
237 case prefixMatch (stepOn buf) "..)" of
238 Just buf' -> cont ITdotdot (stepOverLexeme buf')
240 case lookAhead# buf 1# of
241 '#'# -> cont IToubxparen (stepOnBy# buf 2#)
242 _ -> cont IToparen (stepOn buf)
243 ')'# -> cont ITcparen (stepOn buf)
244 '}'# -> cont ITccurly (stepOn buf)
245 '#'# -> case lookAhead# buf 1# of
246 ')'# -> cont ITcubxparen (stepOnBy# buf 2#)
247 _ -> lex_sym cont (incLexeme buf)
248 '['# -> cont ITobrack (stepOn buf)
249 ']'# -> cont ITcbrack (stepOn buf)
250 ','# -> cont ITcomma (stepOn buf)
251 ';'# -> cont ITsemi (stepOn buf)
253 -- strings/characters -------------------------------------------------
254 '\"'#{-"-} -> case untilEndOfString# (stepOn buf) of
256 -- the string literal does *not* include the dquotes
257 case lexemeToFastString buf' of
258 v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
261 -- untilEndOfChar# extends the current lexeme until
262 -- it hits a non-escaped single quote. The lexeme of the
263 -- StringBuffer returned does *not* include the closing quote,
264 -- hence we augment the lexeme and make sure to add the
265 -- starting quote, before `read'ing the string.
267 case untilEndOfChar# (stepOn buf) of
268 buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
269 [ (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
271 -- strictness pragma and __scc treated specially.
273 case lookAhead# buf 1# of
274 '_'# -> case lookAhead# buf 2# of
276 lex_demand cont (stepOnUntil (not . isSpace)
277 (stepOnBy# buf 3#)) -- past __S
279 case prefixMatch (stepOnBy# buf 3#) "cc" of
280 Just buf' -> lex_scc cont (stepOverLexeme buf')
281 Nothing -> lex_id cont buf
285 -- ``thingy'' form for casm
287 case lookAhead# buf 1# of
288 '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `s and go.
289 _ -> lex_sym cont (incLexeme buf) -- add ` to lexeme and assume
290 -- scanning an id of some sort.
293 if bufferExhausted (stepOn buf) then
296 trace "lexIface: misplaced NUL?" $
297 cont (ITunknown "\NUL") (stepOn buf)
299 c | is_digit c -> lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
300 | is_symbol c -> lex_sym cont buf
301 | is_upper c -> lex_con cont buf
302 | is_ident c -> lex_id cont buf
305 lex_comment cont buf =
306 -- _trace ("comment: "++[C# (currentChar# buf)]) $
307 case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
309 -------------------------------------------------------------------------------
311 lex_nested_comment cont buf =
312 case currentChar# buf of
313 '-'# -> case lookAhead# buf 1# of
314 '}'# -> cont (stepOnBy# buf 2#)
315 _ -> lex_nested_comment cont (stepOn buf)
317 '{'# -> case lookAhead# buf 1# of
318 '-'# -> lex_nested_comment
319 (lex_nested_comment cont)
321 _ -> lex_nested_comment cont (stepOn buf)
323 _ -> lex_nested_comment cont (stepOn buf)
325 -------------------------------------------------------------------------------
327 lex_demand cont buf =
328 case read_em [] buf of { (ls,buf') ->
329 case currentChar# buf' of
330 'B'# -> cont (ITstrict (ls, True )) (stepOverLexeme (stepOn buf'))
331 _ -> cont (ITstrict (ls, False)) (stepOverLexeme buf')
334 -- code snatched from Demand.lhs
336 case currentChar# buf of
337 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
338 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
339 'S'# -> read_em (WwStrict : acc) (stepOn buf)
340 'P'# -> read_em (WwPrim : acc) (stepOn buf)
341 'E'# -> read_em (WwEnum : acc) (stepOn buf)
342 ')'# -> (reverse acc, stepOn buf)
343 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
344 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
345 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
346 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
347 _ -> (reverse acc, buf)
349 do_unpack new_or_data wrapper_unpacks acc buf
350 = case read_em [] buf of
351 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
355 case currentChar# buf of
356 'C'# -> cont ITsccAllCafs (stepOverLexeme (stepOn buf))
357 other -> cont ITscc buf
360 lex_num :: (IfaceToken -> IfM a) -> (Int -> Int) -> Int# -> IfM a
361 lex_num cont minus acc# buf =
362 --trace ("lex_num: "++[C# (currentChar# buf)]) $
363 case scanNumLit (I# acc#) buf of
365 case currentChar# buf' of
367 -- this case is not optimised at all, as the
368 -- presence of floating point numbers in interface
369 -- files is not that common. (ToDo)
370 case expandWhile# is_digit (incLexeme buf') of
371 buf2 -> -- points to first non digit char
372 let l = case currentChar# buf2 of
373 'e'# -> let buf3 = incLexeme buf2 in
374 case currentChar# buf3 of
375 '-'# -> expandWhile# is_digit (incLexeme buf3)
376 _ -> expandWhile# is_digit buf3
378 in let v = readRational__ (lexemeToString l) in
379 cont (ITrational v) (stepOverLexeme l)
381 _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
384 lex_cstring cont buf =
385 case expandUntilMatch buf "\'\'" of
386 buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
387 (stepOverLexeme buf')
389 ------------------------------------------------------------------------------
392 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
394 {-# INLINE is_ctype #-}
395 #if __GLASGOW_HASKELL__ >= 303
396 is_ctype :: Word8 -> Char# -> Bool
397 is_ctype mask = \c ->
398 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
400 is_ctype :: Int -> Char# -> Bool
401 is_ctype (I# mask) = \c ->
402 let (A# ctype) = ``char_types'' :: Addr
403 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
405 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
408 is_ident = is_ctype 1
409 is_symbol = is_ctype 2
411 is_space = is_ctype 8
412 is_upper = is_ctype 16
413 is_digit = is_ctype 32
415 -----------------------------------------------------------------------------
416 -- identifiers, symbols etc.
419 case expandWhile# is_ident buf of { buf1 ->
420 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
421 let new_buf = stepOverLexeme buf'
422 lexeme = lexemeToFastString buf'
424 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
425 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
426 cont kwd_token new_buf;
428 case lookupUFM ifaceKeywordsFM lexeme of {
429 Just kwd_token -> --trace ("ifacekeywd: "++_UNPK_(lexeme)) $
430 cont kwd_token new_buf;
431 Nothing -> --trace ("id: "++_UNPK_(lexeme)) $
432 cont (mk_var_token lexeme) new_buf
436 case expandWhile# is_symbol buf of
438 | is_comment lexeme -> lex_comment cont new_buf
440 case lookupUFM haskellKeySymsFM lexeme of {
441 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
442 cont kwd_token new_buf ;
443 Nothing -> --trace ("sym: "++unpackFS lexeme) $
444 cont (mk_var_token lexeme) new_buf
446 where lexeme = lexemeToFastString buf'
447 new_buf = stepOverLexeme buf'
451 | otherwise = trundle 0
455 trundle n | n == len = True
456 | otherwise = indexFS fs n == '-' && trundle (n+1)
459 case expandWhile# is_ident buf of { buf1 ->
460 case expandWhile# (eqChar# '#'#) buf1 of { buf' ->
461 case currentChar# buf' of
463 '!'# -> munch hiBootFile
467 just_a_conid = --trace ("con: "++unpackFS lexeme) $
468 cont (ITconid lexeme) new_buf
469 lexeme = lexemeToFastString buf'
470 new_buf = stepOverLexeme buf'
471 munch hif = lex_qid cont lexeme hif (stepOn new_buf) just_a_conid
474 lex_qid cont mod hif buf just_a_conid =
475 case currentChar# buf of
476 '['# -> -- Special case for []
477 case lookAhead# buf 1# of
478 ']'# -> cont (ITqconid (mod,SLIT("[]"),hif)) (stepOnBy# buf 2#)
481 '('# -> -- Special case for (,,,)
482 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
483 case lookAhead# buf 1# of
484 '#'# -> case lookAhead# buf 2# of
485 ','# -> lex_ubx_tuple cont mod hif (stepOnBy# buf 3#)
488 ')'# -> cont (ITqconid (mod,SLIT("()"),hif)) (stepOnBy# buf 2#)
489 ','# -> lex_tuple cont mod hif (stepOnBy# buf 2#) just_a_conid
492 '-'# -> case lookAhead# buf 1# of
493 '>'# -> cont (ITqconid (mod,SLIT("->"),hif)) (stepOnBy# buf 2#)
494 _ -> lex_id3 cont mod hif buf just_a_conid
495 _ -> lex_id3 cont mod hif buf just_a_conid
497 lex_id3 cont mod hif buf just_a_conid
499 case expandWhile# is_symbol buf of { buf' ->
501 lexeme = lexemeToFastString buf'
502 new_buf = stepOverLexeme buf'
504 case lookupUFM haskellKeySymsFM lexeme of {
505 Just kwd_token -> just_a_conid; -- avoid M.:: etc.
506 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
510 case expandWhile# is_ident buf of { buf1 ->
514 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
516 lexeme = lexemeToFastString buf'
517 new_buf = stepOverLexeme buf'
519 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
520 Just kwd_token -> just_a_conid; -- avoid M.where etc.
522 case lookupUFM ifaceKeywordsFM lexeme of { -- only for iface files
523 Just kwd_token -> just_a_conid;
524 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
526 where c = currentChar# buf
529 | is_upper f = ITconid pk_str
530 -- _[A-Z] is treated as a constructor in interface files.
531 | f `eqChar#` '_'# && not (_NULL_ tl)
532 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
533 | is_ident f = ITvarid pk_str
534 | f `eqChar#` ':'# = ITconsym pk_str
535 | otherwise = ITvarsym pk_str
537 (C# f) = _HEAD_ pk_str
540 mk_qvar_token m hif token =
541 case mk_var_token token of
542 ITconid n -> ITqconid (m,n,hif)
543 ITvarid n -> ITqvarid (m,n,hif)
544 ITconsym n -> ITqconsym (m,n,hif)
545 ITvarsym n -> ITqvarsym (m,n,hif)
546 _ -> ITunknown (show token)
549 ----------------------------------------------------------------------------
550 Horrible stuff for dealing with M.(,,,)
553 lex_tuple cont mod hif buf back_off =
557 case currentChar# buf of
558 ','# -> go (n+1) (stepOn buf)
559 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n),hif)) (stepOn buf)
562 lex_ubx_tuple cont mod hif buf back_off =
566 case currentChar# buf of
567 ','# -> go (n+1) (stepOn buf)
568 '#'# -> case lookAhead# buf 1# of
569 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n), hif))
575 -----------------------------------------------------------------------------
579 ifaceKeywordsFM :: UniqFM IfaceToken
580 ifaceKeywordsFM = listToUFM $
581 map (\ (x,y) -> (_PK_ x,y))
582 [ ("__interface", ITinterface),
583 ("__export", ITexport),
584 ("__instimport", ITinstimport),
585 ("__forall", ITforall),
586 ("__letrec", ITletrec),
587 ("__coerce", ITcoerce),
588 ("__inline", ITinline),
589 ("__DEFAULT", ITdefaultbranch),
591 ("__integer", ITinteger_lit),
592 ("__float", ITfloat_lit),
593 ("__rational", ITrational_lit),
594 ("__addr", ITaddr_lit),
595 ("__litlit", ITlit_lit),
596 ("__string", ITstring_lit),
599 ("__P", ITspecialise),
601 ("__u", ITunfold NoInlinePragInfo),
602 ("__U", ITunfold IWantToBeINLINEd),
603 ("__UU", ITunfold IMustBeINLINEd),
604 ("__Unot", ITunfold IMustNotBeINLINEd),
605 ("__Ux", ITunfold IAmALoopBreaker),
607 ("__ccall", ITccall (False, False, False)),
608 ("__ccall_GC", ITccall (False, False, True)),
609 ("__dyn_ccall", ITccall (True, False, False)),
610 ("__dyn_ccall_GC", ITccall (True, False, True)),
611 ("__casm", ITccall (False, True, False)),
612 ("__dyn_casm", ITccall (True, True, False)),
613 ("__casm_GC", ITccall (False, True, True)),
614 ("__dyn_casm_GC", ITccall (True, True, True)),
619 haskellKeywordsFM = listToUFM $
620 map (\ (x,y) -> (_PK_ x,y))
622 ( "class", ITclass ),
624 ( "default", ITdefault ),
625 ( "deriving", ITderiving ),
629 ( "import", ITimport ),
631 ( "infix", ITinfix ),
632 ( "infixl", ITinfixl ),
633 ( "infixr", ITinfixr ),
634 ( "instance", ITinstance ),
636 ( "module", ITmodule ),
637 ( "newtype", ITnewtype ),
643 -- These three aren't Haskell keywords at all
644 -- and 'as' is often used as a variable name
646 -- ( "qualified", ITqualified ),
647 -- ( "hiding", IThiding )
651 haskellKeySymsFM = listToUFM $
652 map (\ (x,y) -> (_PK_ x,y))
668 -----------------------------------------------------------------------------
669 doDiscard rips along really fast, looking for a '#-}',
670 indicating the end of the pragma we're skipping
673 doDiscard inStr buf =
674 case currentChar# buf of
676 case lookAhead# buf 1# of { '#'# ->
677 case lookAhead# buf 2# of { '-'# ->
678 case lookAhead# buf 3# of { '}'# ->
679 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
680 _ -> doDiscard inStr (incLexeme buf) };
681 _ -> doDiscard inStr (incLexeme buf) };
682 _ -> doDiscard inStr (incLexeme buf) }
685 odd_slashes buf flg i# =
686 case lookAhead# buf i# of
687 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
690 case lookAhead# buf (negateInt# 1#) of --backwards, actually
691 '\\'# -> -- escaping something..
692 if odd_slashes buf True (negateInt# 2#) then
693 -- odd number of slashes, " is escaped.
694 doDiscard inStr (incLexeme buf)
696 -- even number of slashes, \ is escaped.
697 doDiscard (not inStr) (incLexeme buf)
698 _ -> case inStr of -- forced to avoid build-up
699 True -> doDiscard False (incLexeme buf)
700 False -> doDiscard True (incLexeme buf)
701 _ -> doDiscard inStr (incLexeme buf)
705 -----------------------------------------------------------------------------
708 type IfM a = StringBuffer -- Input string
710 -> MaybeErr a {-error-}Message
712 returnIf :: a -> IfM a
713 returnIf a s l = Succeeded a
715 thenIf :: IfM a -> (a -> IfM b) -> IfM b
716 m `thenIf` k = \s l ->
718 Succeeded a -> k a s l
719 Failed err -> Failed err
721 getSrcLocIf :: IfM SrcLoc
722 getSrcLocIf s l = Succeeded l
725 happyError s l = Failed (ifaceParseErr s l)
729 Note that if the name of the file we're processing ends
730 with `hi-boot', we accept it on faith as having the right
731 version. This is done so that .hi-boot files that comes
732 with hsc don't have to be updated before every release,
733 *and* it allows us to share .hi-boot files with versions
734 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
736 If the version number is 0, the checking is also turned off.
737 (needed to deal with GHC.hi only!)
739 Once we can assume we're compiling with a version of ghc that
740 supports interface file checking, we can drop the special
743 checkVersion :: Maybe Integer -> IfM ()
744 checkVersion mb@(Just v) s l
745 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = Succeeded ()
746 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
747 checkVersion mb@Nothing s l
748 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
749 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
751 -----------------------------------------------------------------
753 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
755 = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
756 ptext SLIT("current input ="), text first_bit]
758 first_bit = lexemeToString (stepOnBy# s 100#)
760 ifaceVersionErr hi_vers l toks
761 = hsep [ppr l, ptext SLIT("Interface file version error;"),
762 ptext SLIT("Expected"), int opt_HiVersion,
763 ptext SLIT("found "), pp_version]
767 Nothing -> ptext SLIT("pre ghc-3.02 version")
768 Just v -> ptext SLIT("version") <+> integer v