2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
33 #include "HsVersions.h"
35 import Char ( ord, isSpace )
36 import List ( isSuffixOf )
38 import IdInfo ( InlinePragInfo(..) )
39 import Name ( isLowerISO, isUpperISO )
40 import OccName ( IfaceFlavour, hiFile, hiBootFile )
41 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
42 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
43 import Demand ( Demand(..) {- instance Read -} )
44 import UniqFM ( UniqFM, listToUFM, lookupUFM)
45 import BasicTypes ( NewOrData(..) )
46 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile )
48 import Maybes ( MaybeErr(..) )
49 import ErrUtils ( Message )
57 #if __GLASGOW_HASKELL__ >= 303
64 import PrelRead ( readRational__ ) -- Glasgow non-std
67 %************************************************************************
69 \subsection{Data types}
71 %************************************************************************
73 The token data type, fairly un-interesting except from one
74 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
75 strictness, unfolding etc).
77 The Idea/Observation here is that the renamer needs to scan through
78 all of an interface file before it can continue. But only a fraction
79 of the information contained in the file turns out to be useful, so
80 delaying as much as possible of the scanning and parsing of an
81 interface file Makes Sense (Heap profiles of the compiler
82 show a reduction in heap usage by at least a factor of two,
85 Hence, the interface file lexer spots when value declarations are
86 being scanned and return the @ITidinfo@ and @ITtype@ constructors
87 for the type and any other id info for that binding (unfolding, strictness
88 etc). These constructors are applied to the result of lexing these sub-chunks.
90 The lexing of the type and id info is all done lazily, of course, so
91 the scanning (and subsequent parsing) will be done *only* on the ids the
92 renamer finds out that it is interested in. The rest will just be junked.
93 Laziness, you know it makes sense :-)
97 = ITcase -- Haskell keywords
122 | ITinterface -- GHC-extension keywords
129 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
142 | ITunfold InlinePragInfo
143 | ITstrict ([Demand], Bool)
148 | ITdotdot -- reserved symbols
161 | ITbiglam -- GHC-extension symbols
163 | ITocurly -- special symbols
174 | ITvarid FAST_STRING -- identifiers
175 | ITconid FAST_STRING
176 | ITvarsym FAST_STRING
177 | ITconsym FAST_STRING
178 | ITqvarid (FAST_STRING,FAST_STRING,IfaceFlavour)
179 | ITqconid (FAST_STRING,FAST_STRING,IfaceFlavour)
180 | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
181 | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
183 | ITpragma StringBuffer
186 | ITstring FAST_STRING
188 | ITrational Rational
190 | ITunknown String -- Used when the lexer can't make sense of it
191 | ITeof -- end of file token
192 deriving Text -- debugging
195 %************************************************************************
197 \subsection{The lexical analyser}
199 %************************************************************************
202 lexIface :: (IfaceToken -> IfM a) -> IfM a
205 -- if bufferExhausted buf then
208 -- trace ("Lexer: '"++[C# (currentChar# buf)]++"'") $
209 case currentChar# buf of
210 -- whitespace and comments, ignore.
211 ' '# -> lexIface cont (stepOn buf)
212 '\t'# -> lexIface cont (stepOn buf)
213 '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
215 -- Numbers and comments
217 case lookAhead# buf 1# of
218 -- '-'# -> lex_comment cont (stepOnBy# buf 2#)
221 then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
222 else lex_sym cont buf
224 '{'# -> -- look for "{-##" special iface pragma
225 case lookAhead# buf 1# of
226 '-'# -> case lookAhead# buf 2# of
227 '#'# -> case lookAhead# buf 3# of
230 = doDiscard False (stepOnBy# buf 4#) in
231 cont (ITpragma lexeme) buf'
232 _ -> lex_nested_comment (lexIface cont) buf
233 _ -> cont ITocurly (stepOn buf)
234 -- lex_nested_comment (lexIface cont) buf
235 _ -> cont ITocurly (stepOn buf)
237 -- special symbols ----------------------------------------------------
239 case prefixMatch (stepOn buf) "..)" of
240 Just buf' -> cont ITdotdot (stepOverLexeme buf')
242 case lookAhead# buf 1# of
243 '#'# -> cont IToubxparen (stepOnBy# buf 2#)
244 _ -> cont IToparen (stepOn buf)
245 ')'# -> cont ITcparen (stepOn buf)
246 '}'# -> cont ITccurly (stepOn buf)
247 '#'# -> case lookAhead# buf 1# of
248 ')'# -> cont ITcubxparen (stepOnBy# buf 2#)
249 _ -> lex_sym cont (incLexeme buf)
250 '['# -> cont ITobrack (stepOn buf)
251 ']'# -> cont ITcbrack (stepOn buf)
252 ','# -> cont ITcomma (stepOn buf)
253 ';'# -> cont ITsemi (stepOn buf)
255 -- strings/characters -------------------------------------------------
256 '\"'#{-"-} -> case untilEndOfString# (stepOn buf) of
258 -- the string literal does *not* include the dquotes
259 case lexemeToFastString buf' of
260 v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
263 -- untilEndOfChar# extends the current lexeme until
264 -- it hits a non-escaped single quote. The lexeme of the
265 -- StringBuffer returned does *not* include the closing quote,
266 -- hence we augment the lexeme and make sure to add the
267 -- starting quote, before `read'ing the string.
269 case untilEndOfChar# (stepOn buf) of
270 buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
271 [ (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
273 -- strictness pragma and __scc treated specially.
275 case lookAhead# buf 1# of
276 '_'# -> case lookAhead# buf 2# of
278 lex_demand cont (stepOnUntil (not . isSpace)
279 (stepOnBy# buf 3#)) -- past __S
281 case prefixMatch (stepOnBy# buf 3#) "cc" of
282 Just buf' -> lex_scc cont (stepOverLexeme buf')
283 Nothing -> lex_id cont buf
287 -- ``thingy'' form for casm
289 case lookAhead# buf 1# of
290 '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `s and go.
291 _ -> lex_sym cont (incLexeme buf) -- add ` to lexeme and assume
292 -- scanning an id of some sort.
295 if bufferExhausted (stepOn buf) then
298 trace "lexIface: misplaced NUL?" $
299 cont (ITunknown "\NUL") (stepOn buf)
301 c | is_digit c -> lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
302 | is_symbol c -> lex_sym cont buf
303 | is_upper c -> lex_con cont buf
304 | is_ident c -> lex_id cont buf
307 lex_comment cont buf =
308 -- _trace ("comment: "++[C# (currentChar# buf)]) $
309 case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
311 -------------------------------------------------------------------------------
313 lex_nested_comment cont buf =
314 case currentChar# buf of
315 '-'# -> case lookAhead# buf 1# of
316 '}'# -> cont (stepOnBy# buf 2#)
317 _ -> lex_nested_comment cont (stepOn buf)
319 '{'# -> case lookAhead# buf 1# of
320 '-'# -> lex_nested_comment
321 (lex_nested_comment cont)
323 _ -> lex_nested_comment cont (stepOn buf)
325 _ -> lex_nested_comment cont (stepOn buf)
327 -------------------------------------------------------------------------------
329 lex_demand cont buf =
330 case read_em [] buf of { (ls,buf') ->
331 case currentChar# buf' of
332 'B'# -> cont (ITstrict (ls, True )) (stepOverLexeme (stepOn buf'))
333 _ -> cont (ITstrict (ls, False)) (stepOverLexeme buf')
336 -- code snatched from Demand.lhs
338 case currentChar# buf of
339 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
340 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
341 'S'# -> read_em (WwStrict : acc) (stepOn buf)
342 'P'# -> read_em (WwPrim : acc) (stepOn buf)
343 'E'# -> read_em (WwEnum : acc) (stepOn buf)
344 ')'# -> (reverse acc, stepOn buf)
345 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
346 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
347 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
348 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
349 _ -> (reverse acc, buf)
351 do_unpack new_or_data wrapper_unpacks acc buf
352 = case read_em [] buf of
353 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
357 case currentChar# buf of
358 'C'# -> cont ITsccAllCafs (stepOverLexeme (stepOn buf))
359 'D'# -> cont ITsccAllDicts (stepOverLexeme (stepOn buf))
360 other -> cont ITscc buf
363 lex_num :: (IfaceToken -> IfM a) -> (Int -> Int) -> Int# -> IfM a
364 lex_num cont minus acc# buf =
365 --trace ("lex_num: "++[C# (currentChar# buf)]) $
366 case scanNumLit (I# acc#) buf of
368 case currentChar# buf' of
370 -- this case is not optimised at all, as the
371 -- presence of floating point numbers in interface
372 -- files is not that common. (ToDo)
373 case expandWhile# is_digit (incLexeme buf') of
374 buf2 -> -- points to first non digit char
375 let l = case currentChar# buf2 of
376 'e'# -> let buf3 = incLexeme buf2 in
377 case currentChar# buf3 of
378 '-'# -> expandWhile# is_digit (incLexeme buf3)
379 _ -> expandWhile# is_digit buf3
381 in let v = readRational__ (lexemeToString l) in
382 cont (ITrational v) (stepOverLexeme l)
384 _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
387 lex_cstring cont buf =
388 case expandUntilMatch buf "\'\'" of
389 buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
390 (stepOverLexeme buf')
392 ------------------------------------------------------------------------------
395 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
397 {-# INLINE is_ctype #-}
398 #if __GLASGOW_HASKELL__ >= 303
399 is_ctype :: Word8 -> Char# -> Bool
400 is_ctype mask = \c ->
401 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
403 is_ctype :: Int -> Char# -> Bool
404 is_ctype (I# mask) = \c ->
405 let (A# ctype) = ``char_types'' :: Addr
406 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
408 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
411 is_ident = is_ctype 1
412 is_symbol = is_ctype 2
414 is_space = is_ctype 8
415 is_upper = is_ctype 16
416 is_digit = is_ctype 32
418 -----------------------------------------------------------------------------
419 -- identifiers, symbols etc.
422 case expandWhile# is_ident buf of { buf1 ->
423 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
424 let new_buf = stepOverLexeme buf'
425 lexeme = lexemeToFastString buf'
427 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
428 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
429 cont kwd_token new_buf;
431 case lookupUFM ifaceKeywordsFM lexeme of {
432 Just kwd_token -> --trace ("ifacekeywd: "++_UNPK_(lexeme)) $
433 cont kwd_token new_buf;
434 Nothing -> --trace ("id: "++_UNPK_(lexeme)) $
435 cont (mk_var_token lexeme) new_buf
439 case expandWhile# is_symbol buf of
441 | is_comment lexeme -> lex_comment cont new_buf
443 case lookupUFM haskellKeySymsFM lexeme of {
444 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
445 cont kwd_token new_buf ;
446 Nothing -> --trace ("sym: "++unpackFS lexeme) $
447 cont (mk_var_token lexeme) new_buf
449 where lexeme = lexemeToFastString buf'
450 new_buf = stepOverLexeme buf'
454 | otherwise = trundle 0
458 trundle n | n == len = True
459 | otherwise = indexFS fs n == '-' && trundle (n+1)
462 case expandWhile# is_ident buf of { buf1 ->
463 case expandWhile# (eqChar# '#'#) buf1 of { buf' ->
464 case currentChar# buf' of
466 '!'# -> munch hiBootFile
470 just_a_conid = --trace ("con: "++unpackFS lexeme) $
471 cont (ITconid lexeme) new_buf
472 lexeme = lexemeToFastString buf'
473 new_buf = stepOverLexeme buf'
474 munch hif = lex_qid cont lexeme hif (stepOn new_buf) just_a_conid
477 lex_qid cont mod hif buf just_a_conid =
478 case currentChar# buf of
479 '['# -> -- Special case for []
480 case lookAhead# buf 1# of
481 ']'# -> cont (ITqconid (mod,SLIT("[]"),hif)) (stepOnBy# buf 2#)
484 '('# -> -- Special case for (,,,)
485 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
486 case lookAhead# buf 1# of
487 '#'# -> case lookAhead# buf 2# of
488 ','# -> lex_ubx_tuple cont mod hif (stepOnBy# buf 3#)
491 ')'# -> cont (ITqconid (mod,SLIT("()"),hif)) (stepOnBy# buf 2#)
492 ','# -> lex_tuple cont mod hif (stepOnBy# buf 2#) just_a_conid
495 '-'# -> case lookAhead# buf 1# of
496 '>'# -> cont (ITqconid (mod,SLIT("->"),hif)) (stepOnBy# buf 2#)
497 _ -> lex_id3 cont mod hif buf just_a_conid
498 _ -> lex_id3 cont mod hif buf just_a_conid
500 lex_id3 cont mod hif buf just_a_conid
502 case expandWhile# is_symbol buf of { buf' ->
504 lexeme = lexemeToFastString buf'
505 new_buf = stepOverLexeme buf'
507 case lookupUFM haskellKeySymsFM lexeme of {
508 Just kwd_token -> just_a_conid; -- avoid M.:: etc.
509 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
513 case expandWhile# is_ident buf of { buf1 ->
517 case expandWhile# (eqChar# '#'#) buf1 of { buf' -> -- only if GHC extns on
519 lexeme = lexemeToFastString buf'
520 new_buf = stepOverLexeme buf'
522 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
523 Just kwd_token -> just_a_conid; -- avoid M.where etc.
525 case lookupUFM ifaceKeywordsFM lexeme of { -- only for iface files
526 Just kwd_token -> just_a_conid;
527 Nothing -> cont (mk_qvar_token mod hif lexeme) new_buf
529 where c = currentChar# buf
532 | is_upper f = ITconid pk_str
533 -- _[A-Z] is treated as a constructor in interface files.
534 | f `eqChar#` '_'# && not (_NULL_ tl)
535 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
536 | is_ident f = ITvarid pk_str
537 | f `eqChar#` ':'# = ITconsym pk_str
538 | otherwise = ITvarsym pk_str
540 (C# f) = _HEAD_ pk_str
543 mk_qvar_token m hif token =
544 case mk_var_token token of
545 ITconid n -> ITqconid (m,n,hif)
546 ITvarid n -> ITqvarid (m,n,hif)
547 ITconsym n -> ITqconsym (m,n,hif)
548 ITvarsym n -> ITqvarsym (m,n,hif)
549 _ -> ITunknown (show token)
552 ----------------------------------------------------------------------------
553 Horrible stuff for dealing with M.(,,,)
556 lex_tuple cont mod hif buf back_off =
560 case currentChar# buf of
561 ','# -> go (n+1) (stepOn buf)
562 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n),hif)) (stepOn buf)
565 lex_ubx_tuple cont mod hif buf back_off =
569 case currentChar# buf of
570 ','# -> go (n+1) (stepOn buf)
571 '#'# -> case lookAhead# buf 1# of
572 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n), hif))
578 -----------------------------------------------------------------------------
582 ifaceKeywordsFM :: UniqFM IfaceToken
583 ifaceKeywordsFM = listToUFM $
584 map (\ (x,y) -> (_PK_ x,y))
585 [ ("__interface", ITinterface),
586 ("__export", ITexport),
587 ("__instimport", ITinstimport),
588 ("__forall", ITforall),
589 ("__letrec", ITletrec),
590 ("__coerce", ITcoerce),
591 ("__inline", ITinline),
592 ("__DEFAULT", ITdefaultbranch),
594 ("__integer", ITinteger_lit),
595 ("__float", ITfloat_lit),
596 ("__rational", ITrational_lit),
597 ("__addr", ITaddr_lit),
598 ("__litlit", ITlit_lit),
599 ("__string", ITstring_lit),
602 ("__P", ITspecialise),
604 ("__u", ITunfold NoInlinePragInfo),
605 ("__U", ITunfold IWantToBeINLINEd),
606 ("__UU", ITunfold IMustBeINLINEd),
607 ("__Unot", ITunfold IMustNotBeINLINEd),
608 ("__Ux", ITunfold IAmALoopBreaker),
610 ("__ccall", ITccall (False, False, False)),
611 ("__ccall_GC", ITccall (False, False, True)),
612 ("__dyn_ccall", ITccall (True, False, False)),
613 ("__dyn_ccall_GC", ITccall (True, False, True)),
614 ("__casm", ITccall (False, True, False)),
615 ("__dyn_casm", ITccall (True, True, False)),
616 ("__casm_GC", ITccall (False, True, True)),
617 ("__dyn_casm_GC", ITccall (True, True, True)),
622 haskellKeywordsFM = listToUFM $
623 map (\ (x,y) -> (_PK_ x,y))
625 ( "class", ITclass ),
627 ( "default", ITdefault ),
628 ( "deriving", ITderiving ),
632 ( "import", ITimport ),
634 ( "infix", ITinfix ),
635 ( "infixl", ITinfixl ),
636 ( "infixr", ITinfixr ),
637 ( "instance", ITinstance ),
639 ( "module", ITmodule ),
640 ( "newtype", ITnewtype ),
646 -- These three aren't Haskell keywords at all
647 -- and 'as' is often used as a variable name
649 -- ( "qualified", ITqualified ),
650 -- ( "hiding", IThiding )
654 haskellKeySymsFM = listToUFM $
655 map (\ (x,y) -> (_PK_ x,y))
671 -----------------------------------------------------------------------------
672 doDiscard rips along really fast, looking for a '#-}',
673 indicating the end of the pragma we're skipping
676 doDiscard inStr buf =
677 case currentChar# buf of
679 case lookAhead# buf 1# of { '#'# ->
680 case lookAhead# buf 2# of { '-'# ->
681 case lookAhead# buf 3# of { '}'# ->
682 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
683 _ -> doDiscard inStr (incLexeme buf) };
684 _ -> doDiscard inStr (incLexeme buf) };
685 _ -> doDiscard inStr (incLexeme buf) }
688 odd_slashes buf flg i# =
689 case lookAhead# buf i# of
690 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
693 case lookAhead# buf (negateInt# 1#) of --backwards, actually
694 '\\'# -> -- escaping something..
695 if odd_slashes buf True (negateInt# 2#) then
696 -- odd number of slashes, " is escaped.
697 doDiscard inStr (incLexeme buf)
699 -- even number of slashes, \ is escaped.
700 doDiscard (not inStr) (incLexeme buf)
701 _ -> case inStr of -- forced to avoid build-up
702 True -> doDiscard False (incLexeme buf)
703 False -> doDiscard True (incLexeme buf)
704 _ -> doDiscard inStr (incLexeme buf)
708 -----------------------------------------------------------------------------
711 type IfM a = StringBuffer -- Input string
713 -> MaybeErr a {-error-}Message
715 returnIf :: a -> IfM a
716 returnIf a s l = Succeeded a
718 thenIf :: IfM a -> (a -> IfM b) -> IfM b
719 m `thenIf` k = \s l ->
721 Succeeded a -> k a s l
722 Failed err -> Failed err
724 getSrcLocIf :: IfM SrcLoc
725 getSrcLocIf s l = Succeeded l
728 happyError s l = Failed (ifaceParseErr s l)
732 Note that if the name of the file we're processing ends
733 with `hi-boot', we accept it on faith as having the right
734 version. This is done so that .hi-boot files that comes
735 with hsc don't have to be updated before every release,
736 *and* it allows us to share .hi-boot files with versions
737 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
739 If the version number is 0, the checking is also turned off.
740 (needed to deal with GHC.hi only!)
742 Once we can assume we're compiling with a version of ghc that
743 supports interface file checking, we can drop the special
746 checkVersion :: Maybe Integer -> IfM ()
747 checkVersion mb@(Just v) s l
748 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = Succeeded ()
749 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
750 checkVersion mb@Nothing s l
751 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
752 | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
754 -----------------------------------------------------------------
756 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
758 = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
759 ptext SLIT("current input ="), text first_bit]
761 first_bit = lexemeToString (stepOnBy# s 100#)
763 ifaceVersionErr hi_vers l toks
764 = hsep [ppr l, ptext SLIT("Interface file version error;"),
765 ptext SLIT("Expected"), int opt_HiVersion,
766 ptext SLIT("found "), pp_version]
770 Nothing -> ptext SLIT("pre ghc-3.02 version")
771 Just v -> ptext SLIT("version") <+> integer v