2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
33 #include "HsVersions.h"
35 import Char ( ord, isSpace )
36 import List ( isSuffixOf )
38 import IdInfo ( InlinePragInfo(..) )
39 import Name ( isLowerISO, isUpperISO )
40 import Module ( IfaceFlavour, hiFile, hiBootFile )
41 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
42 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
43 import Demand ( Demand(..) {- instance Read -} )
44 import UniqFM ( UniqFM, listToUFM, lookupUFM)
45 import BasicTypes ( NewOrData(..) )
46 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile )
48 import Maybes ( MaybeErr(..) )
49 import ErrUtils ( Message )
57 #if __GLASGOW_HASKELL__ >= 303
63 import PrelRead ( readRational__ ) -- Glasgow non-std
66 %************************************************************************
68 \subsection{Data types}
70 %************************************************************************
72 The token data type, fairly un-interesting except from one
73 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
74 strictness, unfolding etc).
76 The Idea/Observation here is that the renamer needs to scan through
77 all of an interface file before it can continue. But only a fraction
78 of the information contained in the file turns out to be useful, so
79 delaying as much as possible of the scanning and parsing of an
80 interface file Makes Sense (Heap profiles of the compiler
81 show a reduction in heap usage by at least a factor of two,
84 Hence, the interface file lexer spots when value declarations are
85 being scanned and return the @ITidinfo@ and @ITtype@ constructors
86 for the type and any other id info for that binding (unfolding, strictness
87 etc). These constructors are applied to the result of lexing these sub-chunks.
89 The lexing of the type and id info is all done lazily, of course, so
90 the scanning (and subsequent parsing) will be done *only* on the ids the
91 renamer finds out that it is interested in. The rest will just be junked.
92 Laziness, you know it makes sense :-)
96 = ITcase -- Haskell keywords
121 | ITinterface -- GHC-extension keywords
128 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
141 | ITunfold InlinePragInfo
142 | ITstrict ([Demand], Bool)
147 | ITdotdot -- reserved symbols
160 | ITbiglam -- GHC-extension symbols
162 | ITocurly -- special symbols
173 | ITvarid FAST_STRING -- identifiers
174 | ITconid FAST_STRING
175 | ITvarsym FAST_STRING
176 | ITconsym FAST_STRING
177 | ITqvarid (FAST_STRING,FAST_STRING,IfaceFlavour)
178 | ITqconid (FAST_STRING,FAST_STRING,IfaceFlavour)
179 | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
180 | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
182 | ITpragma StringBuffer
185 | ITstring FAST_STRING
187 | ITrational Rational
189 | ITunknown String -- Used when the lexer can't make sense of it
190 | ITeof -- end of file token
191 deriving Text -- debugging
194 %************************************************************************
196 \subsection{The lexical analyser}
198 %************************************************************************
201 lexIface :: (IfaceToken -> IfM a) -> IfM a
204 -- if bufferExhausted buf then
207 -- trace ("Lexer: '"++[C# (currentChar# buf)]++"'") $
208 case currentChar# buf of
209 -- whitespace and comments, ignore.
210 ' '# -> lexIface cont (stepOn buf)
211 '\t'# -> lexIface cont (stepOn buf)
212 '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
214 -- Numbers and comments
216 case lookAhead# buf 1# of
217 -- '-'# -> lex_comment cont (stepOnBy# buf 2#)
220 then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
221 else lex_sym cont buf
223 '{'# -> -- look for "{-##" special iface pragma
224 case lookAhead# buf 1# of
225 '-'# -> case lookAhead# buf 2# of
226 '#'# -> case lookAhead# buf 3# of
229 = doDiscard False (stepOnBy# buf 4#) in
230 cont (ITpragma lexeme) buf'
231 _ -> lex_nested_comment (lexIface cont) buf
232 _ -> cont ITocurly (stepOn buf)
233 -- lex_nested_comment (lexIface cont) buf
234 _ -> cont ITocurly (stepOn buf)
236 -- special symbols ----------------------------------------------------
238 case prefixMatch (stepOn buf) "..)" of
239 Just buf' -> cont ITdotdot (stepOverLexeme buf')
241 case lookAhead# buf 1# of
242 '#'# -> cont IToubxparen (stepOnBy# buf 2#)
243 _ -> cont IToparen (stepOn buf)
244 ')'# -> cont ITcparen (stepOn buf)
245 '}'# -> cont ITccurly (stepOn buf)
246 '#'# -> case lookAhead# buf 1# of
247 ')'# -> cont ITcubxparen (stepOnBy# buf 2#)
248 _ -> lex_sym cont (incLexeme buf)
249 '['# -> cont ITobrack (stepOn buf)
250 ']'# -> cont ITcbrack (stepOn buf)
251 ','# -> cont ITcomma (stepOn buf)
252 ';'# -> cont ITsemi (stepOn buf)
254 -- strings/characters -------------------------------------------------
255 '\"'#{-"-} -> case untilEndOfString# (stepOn buf) of
257 -- the string literal does *not* include the dquotes
258 case lexemeToFastString buf' of
259 v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
262 -- untilEndOfChar# extends the current lexeme until
263 -- it hits a non-escaped single quote. The lexeme of the
264 -- StringBuffer returned does *not* include the closing quote,
265 -- hence we augment the lexeme and make sure to add the
266 -- starting quote, before `read'ing the string.
268 case untilEndOfChar# (stepOn buf) of
269 buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
270 [ (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
272 -- strictness pragma and __scc treated specially.
274 case lookAhead# buf 1# of
275 '_'# -> case lookAhead# buf 2# of
277 lex_demand cont (stepOnUntil (not . isSpace)
278 (stepOnBy# buf 3#)) -- past __S
280 case prefixMatch (stepOnBy# buf 3#) "cc" of
281 Just buf' -> lex_scc cont (stepOverLexeme buf')
282 Nothing -> lex_id cont buf
286 -- ``thingy'' form for casm
288 case lookAhead# buf 1# of
289 '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `s and go.
290 _ -> lex_sym cont (incLexeme buf) -- add ` to lexeme and assume
291 -- scanning an id of some sort.
294 if bufferExhausted (stepOn buf) then
297 trace "lexIface: misplaced NUL?" $
298 cont (ITunknown "\NUL") (stepOn buf)
300 c | is_digit c -> lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
301 | is_symbol c -> lex_sym cont buf
302 | is_upper c -> lex_con cont buf
303 | is_ident c -> lex_id cont buf
306 lex_comment cont buf =
307 -- _trace ("comment: "++[C# (currentChar# buf)]) $
308 case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
310 -------------------------------------------------------------------------------
312 lex_nested_comment cont buf =
313 case currentChar# buf of
314 '-'# -> case lookAhead# buf 1# of
315 '}'# -> cont (stepOnBy# buf 2#)
316 _ -> lex_nested_comment cont (stepOn buf)
318 '{'# -> case lookAhead# buf 1# of
319 '-'# -> lex_nested_comment
320 (lex_nested_comment cont)
322 _ -> lex_nested_comment cont (stepOn buf)
324 _ -> lex_nested_comment cont (stepOn buf)
326 -------------------------------------------------------------------------------
328 lex_demand cont buf =
329 case read_em [] buf of { (ls,buf') ->
330 case currentChar# buf' of
331 'B'# -> cont (ITstrict (ls, True )) (stepOverLexeme (stepOn buf'))
332 _ -> cont (ITstrict (ls, False)) (stepOverLexeme buf')
335 -- code snatched from Demand.lhs
337 case currentChar# buf of
338 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
339 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
340 'S'# -> read_em (WwStrict : acc) (stepOn buf)
341 'P'# -> read_em (WwPrim : acc) (stepOn buf)
342 'E'# -> read_em (WwEnum : acc) (stepOn buf)
343 ')'# -> (reverse acc, stepOn buf)
344 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
345 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
346 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
347 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
348 _ -> (reverse acc, buf)
350 do_unpack new_or_data wrapper_unpacks acc buf
351 = case read_em [] buf of
352 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
356 case currentChar# buf of
357 'C'# -> cont ITsccAllCafs (stepOverLexeme (stepOn buf))
358 'D'# -> cont ITsccAllDicts (stepOverLexeme (stepOn buf))
359 other -> cont ITscc buf
362 lex_num :: (IfaceToken -> IfM a) -> (Int -> Int) -> Int# -> IfM a
363 lex_num cont minus acc# buf =
364 --trace ("lex_num: "++[C# (currentChar# buf)]) $
365 case scanNumLit (I# acc#) buf of
367 case currentChar# buf' of
369 -- this case is not optimised at all, as the
370 -- presence of floating point numbers in interface
371 -- files is not that common. (ToDo)
372 case expandWhile# is_digit (incLexeme buf') of
373 buf2 -> -- points to first non digit char
374 let l = case currentChar# buf2 of
375 'e'# -> let buf3 = incLexeme buf2 in
376 case currentChar# buf3 of
377 '-'# -> expandWhile# is_digit (incLexeme buf3)
378 _ -> expandWhile# is_digit buf3
380 in let v = readRational__ (lexemeToString l) in
381 cont (ITrational v) (stepOverLexeme l)
383 _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
386 lex_cstring cont buf =
387 case expandUntilMatch buf "\'\'" of
388 buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
389 (stepOverLexeme buf')
391 ------------------------------------------------------------------------------
394 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
396 {-# INLINE is_ctype #-}
397 #if __GLASGOW_HASKELL__ >= 303
398 is_ctype :: Word8 -> Char# -> Bool
399 is_ctype mask = \c ->
400 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
402 is_ctype :: Int -> Char# -> Bool
403 is_ctype (I# mask) = \c ->
404 let (A# ctype) = ``char_types'' :: Addr
405 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
407 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
410 is_ident = is_ctype 1
411 is_symbol = is_ctype 2
413 is_space = is_ctype 8
414 is_upper = is_ctype 16
415 is_digit = is_ctype 32
417 -----------------------------------------------------------------------------
418 -- identifiers, symbols etc.
421 case expandWhile# is_ident buf of { buf1 ->
422 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
423 let new_buf = stepOverLexeme buf'
424 lexeme = lexemeToFastString buf'
426 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
427 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
428 cont kwd_token new_buf;
430 case lookupUFM ifaceKeywordsFM lexeme of {
431 Just kwd_token -> --trace ("ifacekeywd: "++_UNPK_(lexeme)) $
432 cont kwd_token new_buf;
433 Nothing -> --trace ("id: "++_UNPK_(lexeme)) $
434 cont (mk_var_token lexeme) new_buf
438 case expandWhile# is_symbol buf of
440 | is_comment lexeme -> lex_comment cont new_buf
442 case lookupUFM haskellKeySymsFM lexeme of {
443 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
444 cont kwd_token new_buf ;
445 Nothing -> --trace ("sym: "++unpackFS lexeme) $
446 cont (mk_var_token lexeme) new_buf
448 where lexeme = lexemeToFastString buf'
449 new_buf = stepOverLexeme buf'
453 | otherwise = trundle 0
457 trundle n | n == len = True
458 | otherwise = indexFS fs n == '-' && trundle (n+1)
461 case expandWhile# is_ident buf of { buf1 ->
462 case expandWhile# (eqChar# '#'#) buf1 of { buf' ->
463 case currentChar# buf' of
465 '!'# -> munch hiBootFile
469 just_a_conid = --trace ("con: "++unpackFS lexeme) $
470 cont (ITconid lexeme) new_buf
471 lexeme = lexemeToFastString buf'
472 new_buf = stepOverLexeme buf'
473 munch hif = lex_qid cont lexeme hif (stepOn new_buf) just_a_conid
476 lex_qid cont mod hif buf just_a_conid =
477 case currentChar# buf of
478 '['# -> -- Special case for []
479 case lookAhead# buf 1# of
480 ']'# -> cont (ITqconid (mod,SLIT("[]"),hif)) (stepOnBy# buf 2#)
483 '('# -> -- Special case for (,,,)
484 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
485 case lookAhead# buf 1# of
486 '#'# -> case lookAhead# buf 2# of
487 ','# -> lex_ubx_tuple cont mod hif (stepOnBy# buf 3#)
490 ')'# -> cont (ITqconid (mod,SLIT("()"),hif)) (stepOnBy# buf 2#)
491 ','# -> lex_tuple cont mod hif (stepOnBy# buf 2#) just_a_conid
494 '-'# -> case lookAhead# buf 1# of
495 '>'# -> cont (ITqconid (mod,SLIT("->"),hif)) (stepOnBy# buf 2#)
496 _ -> lex_id3 cont mod hif buf just_a_conid
497 _ -> lex_id3 cont mod hif buf just_a_conid
499 lex_id3 cont mod hif buf just_a_conid
501 case expandWhile# is_symbol buf of { buf' ->
503 lexeme = lexemeToFastString buf'
504 new_buf = stepOverLexeme buf'
506 case lookupUFM haskellKeySymsFM lexeme of {
507 Just kwd_token -> just_a_conid; -- avoid M.:: etc.
508 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
512 case expandWhile# is_ident buf of { buf1 ->
516 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
518 lexeme = lexemeToFastString buf'
519 new_buf = stepOverLexeme buf'
521 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
522 Just kwd_token -> just_a_conid; -- avoid M.where etc.
524 case lookupUFM ifaceKeywordsFM lexeme of { -- only for iface files
525 Just kwd_token -> just_a_conid;
526 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
528 where c = currentChar# buf
531 | is_upper f = ITconid pk_str
532 -- _[A-Z] is treated as a constructor in interface files.
533 | f `eqChar#` '_'# && not (_NULL_ tl)
534 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
535 | is_ident f = ITvarid pk_str
536 | f `eqChar#` ':'# = ITconsym pk_str
537 | otherwise = ITvarsym pk_str
539 (C# f) = _HEAD_ pk_str
542 mk_qvar_token m hif token =
543 case mk_var_token token of
544 ITconid n -> ITqconid (m,n,hif)
545 ITvarid n -> ITqvarid (m,n,hif)
546 ITconsym n -> ITqconsym (m,n,hif)
547 ITvarsym n -> ITqvarsym (m,n,hif)
548 _ -> ITunknown (show token)
551 ----------------------------------------------------------------------------
552 Horrible stuff for dealing with M.(,,,)
555 lex_tuple cont mod hif buf back_off =
559 case currentChar# buf of
560 ','# -> go (n+1) (stepOn buf)
561 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n),hif)) (stepOn buf)
564 lex_ubx_tuple cont mod hif buf back_off =
568 case currentChar# buf of
569 ','# -> go (n+1) (stepOn buf)
570 '#'# -> case lookAhead# buf 1# of
571 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n), hif))
577 -----------------------------------------------------------------------------
581 ifaceKeywordsFM :: UniqFM IfaceToken
582 ifaceKeywordsFM = listToUFM $
583 map (\ (x,y) -> (_PK_ x,y))
584 [ ("__interface", ITinterface),
585 ("__export", ITexport),
586 ("__instimport", ITinstimport),
587 ("__forall", ITforall),
588 ("__letrec", ITletrec),
589 ("__coerce", ITcoerce),
590 ("__inline", ITinline),
591 ("__DEFAULT", ITdefaultbranch),
593 ("__integer", ITinteger_lit),
594 ("__float", ITfloat_lit),
595 ("__rational", ITrational_lit),
596 ("__addr", ITaddr_lit),
597 ("__litlit", ITlit_lit),
598 ("__string", ITstring_lit),
601 ("__P", ITspecialise),
603 ("__u", ITunfold NoInlinePragInfo),
604 ("__U", ITunfold IWantToBeINLINEd),
605 ("__UU", ITunfold IMustBeINLINEd),
606 ("__Unot", ITunfold IMustNotBeINLINEd),
607 ("__Ux", ITunfold IAmALoopBreaker),
609 ("__ccall", ITccall (False, False, False)),
610 ("__ccall_GC", ITccall (False, False, True)),
611 ("__dyn_ccall", ITccall (True, False, False)),
612 ("__dyn_ccall_GC", ITccall (True, False, True)),
613 ("__casm", ITccall (False, True, False)),
614 ("__dyn_casm", ITccall (True, True, False)),
615 ("__casm_GC", ITccall (False, True, True)),
616 ("__dyn_casm_GC", ITccall (True, True, True)),
621 haskellKeywordsFM = listToUFM $
622 map (\ (x,y) -> (_PK_ x,y))
624 ( "class", ITclass ),
626 ( "default", ITdefault ),
627 ( "deriving", ITderiving ),
631 ( "import", ITimport ),
633 ( "infix", ITinfix ),
634 ( "infixl", ITinfixl ),
635 ( "infixr", ITinfixr ),
636 ( "instance", ITinstance ),
638 ( "module", ITmodule ),
639 ( "newtype", ITnewtype ),
645 -- These three aren't Haskell keywords at all
646 -- and 'as' is often used as a variable name
648 -- ( "qualified", ITqualified ),
649 -- ( "hiding", IThiding )
653 haskellKeySymsFM = listToUFM $
654 map (\ (x,y) -> (_PK_ x,y))
670 -----------------------------------------------------------------------------
671 doDiscard rips along really fast, looking for a '#-}',
672 indicating the end of the pragma we're skipping
675 doDiscard inStr buf =
676 case currentChar# buf of
678 case lookAhead# buf 1# of { '#'# ->
679 case lookAhead# buf 2# of { '-'# ->
680 case lookAhead# buf 3# of { '}'# ->
681 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
682 _ -> doDiscard inStr (incLexeme buf) };
683 _ -> doDiscard inStr (incLexeme buf) };
684 _ -> doDiscard inStr (incLexeme buf) }
687 odd_slashes buf flg i# =
688 case lookAhead# buf i# of
689 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
692 case lookAhead# buf (negateInt# 1#) of --backwards, actually
693 '\\'# -> -- escaping something..
694 if odd_slashes buf True (negateInt# 2#) then
695 -- odd number of slashes, " is escaped.
696 doDiscard inStr (incLexeme buf)
698 -- even number of slashes, \ is escaped.
699 doDiscard (not inStr) (incLexeme buf)
700 _ -> case inStr of -- forced to avoid build-up
701 True -> doDiscard False (incLexeme buf)
702 False -> doDiscard True (incLexeme buf)
703 _ -> doDiscard inStr (incLexeme buf)
707 -----------------------------------------------------------------------------
710 type IfM a = StringBuffer -- Input string
712 -> MaybeErr a {-error-}Message
714 returnIf :: a -> IfM a
715 returnIf a s l = Succeeded a
717 thenIf :: IfM a -> (a -> IfM b) -> IfM b
718 m `thenIf` k = \s l ->
720 Succeeded a -> k a s l
721 Failed err -> Failed err
723 getSrcLocIf :: IfM SrcLoc
724 getSrcLocIf s l = Succeeded l
727 happyError s l = Failed (ifaceParseErr s l)
731 Note that if the name of the file we're processing ends
732 with `hi-boot', we accept it on faith as having the right
733 version. This is done so that .hi-boot files that comes
734 with hsc don't have to be updated before every release,
735 *and* it allows us to share .hi-boot files with versions
736 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
738 If the version number is 0, the checking is also turned off.
739 (needed to deal with GHC.hi only!)
741 Once we can assume we're compiling with a version of ghc that
742 supports interface file checking, we can drop the special
745 checkVersion :: Maybe Integer -> IfM ()
746 checkVersion mb@(Just v) s l
747 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = Succeeded ()
748 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
749 checkVersion mb@Nothing s l
750 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
751 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
753 -----------------------------------------------------------------
755 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
757 = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
758 ptext SLIT("current input ="), text first_bit]
760 first_bit = lexemeToString (stepOnBy# s 100#)
762 ifaceVersionErr hi_vers l toks
763 = hsep [ppr l, ptext SLIT("Interface file version error;"),
764 ptext SLIT("Expected"), int opt_HiVersion,
765 ptext SLIT("found "), pp_version]
769 Nothing -> ptext SLIT("pre ghc-3.02 version")
770 Just v -> ptext SLIT("version") <+> integer v