2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
22 ifaceParseErr, srcParseErr,
25 Token(..), lexer, ParseResult(..), PState(..),
26 checkVersion, ExtFlags(..), mkPState,
29 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
30 getSrcLocP, setSrcLocP, getSrcFile,
31 layoutOn, layoutOff, pushContext, popContext
34 #include "HsVersions.h"
36 import Char ( isSpace, toUpper )
37 import List ( isSuffixOf )
39 import PrelNames ( mkTupNameStr )
40 import CmdLineOpts ( opt_HiVersion, opt_NoHiCheck )
41 import ForeignCall ( Safety(..) )
42 import NewDemand ( StrictSig(..), Demand(..), Demands(..),
43 DmdResult(..), mkTopDmdType, evalDmd, lazyDmd )
44 import UniqFM ( listToUFM, lookupUFM )
45 import BasicTypes ( Boxity(..) )
46 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
47 replaceSrcLine, mkSrcLoc )
49 import ErrUtils ( Message )
56 import Char ( chr, ord )
57 import PrelRead ( readRational__ ) -- Glasgow non-std
58 import PrelBits ( Bits(..) ) -- non-std
61 %************************************************************************
63 \subsection{Data types}
65 %************************************************************************
67 The token data type, fairly un-interesting except from one
68 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
69 strictness, unfolding etc).
71 The Idea/Observation here is that the renamer needs to scan through
72 all of an interface file before it can continue. But only a fraction
73 of the information contained in the file turns out to be useful, so
74 delaying as much as possible of the scanning and parsing of an
75 interface file Makes Sense (Heap profiles of the compiler
76 show a reduction in heap usage by at least a factor of two,
79 Hence, the interface file lexer spots when value declarations are
80 being scanned and return the @ITidinfo@ and @ITtype@ constructors
81 for the type and any other id info for that binding (unfolding, strictness
82 etc). These constructors are applied to the result of lexing these sub-chunks.
84 The lexing of the type and id info is all done lazily, of course, so
85 the scanning (and subsequent parsing) will be done *only* on the ids the
86 renamer finds out that it is interested in. The rest will just be junked.
87 Laziness, you know it makes sense :-)
91 = ITas -- Haskell keywords
115 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
117 | ITforall -- GHC extension keywords
129 | ITinterface -- interface keywords
137 | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc)
164 | ITspecialise_prag -- Pragmas
174 | ITdotdot -- reserved symbols
188 | ITbiglam -- GHC-extension symbols
190 | ITocurly -- special symbols
192 | ITocurlybar -- {|, for type applications
193 | ITccurlybar -- |}, for type applications
196 | ITopabrack -- [:, for parallel arrays with -fparr
197 | ITcpabrack -- :], for parallel arrays with -fparr
208 | ITvarid FAST_STRING -- identifiers
209 | ITconid FAST_STRING
210 | ITvarsym FAST_STRING
211 | ITconsym FAST_STRING
212 | ITqvarid (FAST_STRING,FAST_STRING)
213 | ITqconid (FAST_STRING,FAST_STRING)
214 | ITqvarsym (FAST_STRING,FAST_STRING)
215 | ITqconsym (FAST_STRING,FAST_STRING)
217 | ITdupipvarid FAST_STRING -- GHC extension: implicit param: ?x
218 | ITsplitipvarid FAST_STRING -- GHC extension: implicit param: %x
220 | ITpragma StringBuffer
223 | ITstring FAST_STRING
225 | ITrational Rational
228 | ITprimstring FAST_STRING
230 | ITprimfloat Rational
231 | ITprimdouble Rational
232 | ITlitlit FAST_STRING
234 | ITunknown String -- Used when the lexer can't make sense of it
235 | ITeof -- end of file token
236 deriving Show -- debugging
239 -----------------------------------------------------------------------------
243 pragmaKeywordsFM = listToUFM $
244 map (\ (x,y) -> (_PK_ x,y))
245 [( "SPECIALISE", ITspecialise_prag ),
246 ( "SPECIALIZE", ITspecialise_prag ),
247 ( "SOURCE", ITsource_prag ),
248 ( "INLINE", ITinline_prag ),
249 ( "NOINLINE", ITnoinline_prag ),
250 ( "NOTINLINE", ITnoinline_prag ),
251 ( "LINE", ITline_prag ),
252 ( "RULES", ITrules_prag ),
253 ( "RULEZ", ITrules_prag ), -- american spelling :-)
254 ( "SCC", ITscc_prag ),
255 ( "DEPRECATED", ITdeprecated_prag )
258 haskellKeywordsFM = listToUFM $
259 map (\ (x,y) -> (_PK_ x,y))
260 [( "_", ITunderscore ),
263 ( "class", ITclass ),
265 ( "default", ITdefault ),
266 ( "deriving", ITderiving ),
269 ( "hiding", IThiding ),
271 ( "import", ITimport ),
273 ( "infix", ITinfix ),
274 ( "infixl", ITinfixl ),
275 ( "infixr", ITinfixr ),
276 ( "instance", ITinstance ),
278 ( "module", ITmodule ),
279 ( "newtype", ITnewtype ),
281 ( "qualified", ITqualified ),
284 ( "where", ITwhere ),
285 ( "_scc_", ITscc ) -- ToDo: remove
288 isSpecial :: Token -> Bool
289 -- If we see M.x, where x is a keyword, but
290 -- is special, we treat is as just plain M.x,
292 isSpecial ITas = True
293 isSpecial IThiding = True
294 isSpecial ITqualified = True
295 isSpecial ITforall = True
296 isSpecial ITexport = True
297 isSpecial ITlabel = True
298 isSpecial ITdynamic = True
299 isSpecial ITsafe = True
300 isSpecial ITunsafe = True
301 isSpecial ITwith = True
302 isSpecial ITccallconv = True
303 isSpecial ITstdcallconv = True
306 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
307 ghcExtensionKeywordsFM = listToUFM $
308 map (\ (x,y) -> (_PK_ x,y))
309 [ ( "forall", ITforall ),
310 ( "foreign", ITforeign ),
311 ( "export", ITexport ),
312 ( "label", ITlabel ),
313 ( "dynamic", ITdynamic ),
314 ( "safe", ITunsafe ),
315 ( "unsafe", ITunsafe ),
317 ( "stdcall", ITstdcallconv),
318 ( "ccall", ITccallconv),
319 ( "dotnet", ITdotnet),
320 ("_ccall_", ITccall (False, False, PlayRisky)),
321 ("_ccall_GC_", ITccall (False, False, PlaySafe)),
322 ("_casm_", ITccall (False, True, PlayRisky)),
323 ("_casm_GC_", ITccall (False, True, PlaySafe)),
325 -- interface keywords
326 ("__interface", ITinterface),
327 ("__export", IT__export),
328 ("__depends", ITdepends),
329 ("__forall", IT__forall),
330 ("__letrec", ITletrec),
331 ("__coerce", ITcoerce),
332 ("__inline_me", ITinlineMe),
333 ("__inline_call", ITinlineCall),
334 ("__depends", ITdepends),
335 ("__DEFAULT", ITdefaultbranch),
337 ("__integer", ITinteger_lit),
338 ("__float", ITfloat_lit),
339 ("__int64", ITint64_lit),
340 ("__word", ITword_lit),
341 ("__word64", ITword64_lit),
342 ("__rational", ITrational_lit),
343 ("__addr", ITaddr_lit),
344 ("__label", ITlabel_lit),
345 ("__litlit", ITlit_lit),
346 ("__string", ITstring_lit),
349 ("__fuall", ITfuall),
351 ("__P", ITspecialise),
354 ("__D", ITdeprecated),
357 ("__ccall", ITccall (False, False, PlayRisky)),
358 ("__ccall_GC", ITccall (False, False, PlaySafe)),
359 ("__dyn_ccall", ITccall (True, False, PlayRisky)),
360 ("__dyn_ccall_GC", ITccall (True, False, PlaySafe)),
361 ("__casm", ITccall (False, True, PlayRisky)),
362 ("__dyn_casm", ITccall (True, True, PlayRisky)),
363 ("__casm_GC", ITccall (False, True, PlaySafe)),
364 ("__dyn_casm_GC", ITccall (True, True, PlaySafe)),
370 haskellKeySymsFM = listToUFM $
371 map (\ (x,y) -> (_PK_ x,y))
384 ,(".", ITdot) -- sadly, for 'forall a . t'
388 -----------------------------------------------------------------------------
393 - (exts) lexing a source with extensions, eg, an interface file or
395 - (bol) pointer to beginning of line (for column calculations)
396 - (buf) pointer to beginning of token
397 - (buf) pointer to current char
398 - (atbol) flag indicating whether we're at the beginning of a line
401 lexer :: (Token -> P a) -> P a
402 lexer cont buf s@(PState{
410 -- first, start a new lexeme and lose all the whitespace
412 tab line bol atbol (stepOverLexeme buf)
414 line = srcLocLine loc
416 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
417 case currentChar# buf of
420 if bufferExhausted (stepOn buf)
421 then cont ITeof buf s'
422 else trace "lexer: misplaced NUL?" $
423 tab y bol atbol (stepOn buf)
425 '\n'# -> let buf' = stepOn buf
426 in tab (y +# 1#) (currentIndex# buf') 1# buf'
428 -- find comments. This got harder in Haskell 98.
429 '-'# -> let trundle n =
430 let next = lookAhead# buf n in
431 if next `eqChar#` '-'# then trundle (n +# 1#)
432 else if is_symbol next || n <# 2#
435 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
438 -- comments and pragmas. We deal with LINE pragmas here,
439 -- and throw out any unrecognised pragmas as comments. Any
440 -- pragmas we know about are dealt with later (after any layout
441 -- processing if necessary).
442 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
443 if lookAhead# buf 2# `eqChar#` '#'# then
444 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
445 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
446 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
447 let lexeme = mkFastString -- ToDo: too slow
448 (map toUpper (lexemeToString buf2)) in
449 case lookupUFM pragmaKeywordsFM lexeme of
450 -- ignore RULES pragmas when -fglasgow-exts is off
451 Just ITrules_prag | not (glaExtsEnabled exts) ->
452 skip_to_end (stepOnBy# buf 2#) s'
454 line_prag skip_to_end buf2 s'
455 Just other -> is_a_token
456 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
459 else skip_to_end (stepOnBy# buf 2#) s'
461 skip_to_end = skipNestedComment (lexer cont)
463 -- special GHC extension: we grok cpp-style #line pragmas
464 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
465 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
466 lookAhead# buf 2# `eqChar#` 'i'# &&
467 lookAhead# buf 3# `eqChar#` 'n'# &&
468 lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5#
469 | otherwise = stepOn buf
471 case expandWhile# is_space buf1 of { buf2 ->
472 if is_digit (currentChar# buf2)
473 then line_prag next_line buf2 s'
477 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
479 -- tabs have been expanded beforehand
480 c | is_space c -> tab y bol atbol (stepOn buf)
481 | otherwise -> is_a_token
483 where s' = s{loc = replaceSrcLine loc y,
487 is_a_token | atbol /=# 0# = lexBOL cont buf s'
488 | otherwise = lexToken cont exts buf s'
490 -- {-# LINE .. #-} pragmas. yeuch.
491 line_prag cont buf s@PState{loc=loc} =
492 case expandWhile# is_space buf of { buf1 ->
493 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
494 -- subtract one: the line number refers to the *following* line.
495 let real_line = line - 1 in
496 case fromInteger real_line of { i@(I# l) ->
497 -- ToDo, if no filename then we skip the newline.... d'oh
498 case expandWhile# is_space buf2 of { buf3 ->
499 case currentChar# buf3 of
501 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
503 file = lexemeToFastString buf4
504 new_buf = stepOn (stepOverLexeme buf4)
506 if nullFastString file
507 then cont new_buf s{loc = replaceSrcLine loc l}
508 else cont new_buf s{loc = mkSrcLoc file i}
510 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
513 skipNestedComment :: P a -> P a
514 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
516 skipNestedComment' :: SrcLoc -> P a -> P a
517 skipNestedComment' orig_loc cont buf = loop buf
520 case currentChar# buf of
521 '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
523 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
525 (skipNestedComment' orig_loc cont)
528 '\n'# -> \ s@PState{loc=loc} ->
529 let buf' = stepOn buf in
530 loop buf' s{loc = incSrcLine loc,
531 bol = currentIndex# buf',
534 -- pass the original SrcLoc to lexError so that the error is
535 -- reported at the line it was originally on, not the line at
536 -- the end of the file.
537 '\NUL'# | bufferExhausted (stepOn buf) ->
538 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
540 _ -> loop (stepOn buf)
542 -- When we are lexing the first token of a line, check whether we need to
543 -- insert virtual semicolons or close braces due to layout.
545 lexBOL :: (Token -> P a) -> P a
546 lexBOL cont buf s@(PState{
553 if need_close_curly then
554 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
555 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
556 else if need_semi_colon then
557 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
558 cont ITsemi buf s{atbol = 0#}
560 lexToken cont exts buf s{atbol = 0#}
562 col = currentIndex# buf -# bol
575 Layout n -> col ==# n
578 lexToken :: (Token -> P a) -> Int# -> P a
579 lexToken cont exts buf =
580 -- trace "lexToken" $
581 case currentChar# buf of
583 -- special symbols ----------------------------------------------------
584 '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
585 -> cont IToubxparen (setCurrentPos# buf 2#)
587 -> cont IToparen (incLexeme buf)
589 ')'# -> cont ITcparen (incLexeme buf)
590 '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
591 cont ITopabrack (setCurrentPos# buf 2#)
593 cont ITobrack (incLexeme buf)
594 ']'# -> cont ITcbrack (incLexeme buf)
595 ','# -> cont ITcomma (incLexeme buf)
596 ';'# -> cont ITsemi (incLexeme buf)
597 '}'# -> \ s@PState{context = ctx} ->
599 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
600 _ -> lexError "too many '}'s" buf s
601 '|'# -> case lookAhead# buf 1# of
602 '}'# | glaExtsEnabled exts -> cont ITccurlybar
603 (setCurrentPos# buf 2#)
604 _ -> lex_sym cont (incLexeme buf)
605 ':'# -> case lookAhead# buf 1# of
606 ']'# | parrEnabled exts -> cont ITcpabrack
607 (setCurrentPos# buf 2#)
608 _ -> lex_sym cont (incLexeme buf)
611 '#'# -> case lookAhead# buf 1# of
612 ')'# | glaExtsEnabled exts
613 -> cont ITcubxparen (setCurrentPos# buf 2#)
614 '-'# -> case lookAhead# buf 2# of
615 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
616 _ -> lex_sym cont (incLexeme buf)
617 _ -> lex_sym cont (incLexeme buf)
619 '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
620 -> lex_cstring cont (setCurrentPos# buf 2#)
622 -> cont ITbackquote (incLexeme buf)
624 '{'# -> -- look for "{-##" special iface pragma -- for Emacs: -}
625 case lookAhead# buf 1# of
626 '|'# | glaExtsEnabled exts
627 -> cont ITocurlybar (setCurrentPos# buf 2#)
628 '-'# -> case lookAhead# buf 2# of
629 '#'# -> case lookAhead# buf 3# of
633 (\ cont lexeme buf' -> cont (ITpragma lexeme) buf')
635 (stepOnBy# (stepOverLexeme buf) 4#)
636 _ -> lex_prag cont (setCurrentPos# buf 3#)
637 _ -> cont ITocurly (incLexeme buf)
638 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
640 -- strings/characters -------------------------------------------------
641 '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
642 '\''# -> lex_char (char_end cont) exts (incLexeme buf)
644 -- strictness and cpr pragmas and __scc treated specially.
645 '_'# | glaExtsEnabled exts ->
646 case lookAhead# buf 1# of
647 '_'# -> case lookAhead# buf 2# of
649 lex_demand cont (stepOnUntil (not . isSpace)
650 (stepOnBy# buf 3#)) -- past __S
652 cont ITcprinfo (stepOnBy# buf 3#) -- past __M
655 case prefixMatch (stepOnBy# buf 3#) "cc" of
656 Just buf' -> lex_scc cont (stepOverLexeme buf')
657 Nothing -> lex_id cont exts buf
658 _ -> lex_id cont exts buf
659 _ -> lex_id cont exts buf
661 -- Hexadecimal and octal constants
662 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
663 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
664 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
665 -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec
666 where ch = lookAhead# buf 1#
667 ch2 = lookAhead# buf 2#
668 buf' = setCurrentPos# buf 2#
671 if bufferExhausted (stepOn buf) then
674 trace "lexIface: misplaced NUL?" $
675 cont (ITunknown "\NUL") (stepOn buf)
677 '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
678 lex_ip ITdupipvarid cont (incLexeme buf)
679 '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
680 lex_ip ITsplitipvarid cont (incLexeme buf)
681 c | is_digit c -> lex_num cont exts 0 buf
682 | is_symbol c -> lex_sym cont buf
683 | is_upper c -> lex_con cont exts buf
684 | is_ident c -> lex_id cont exts buf
685 | otherwise -> lexError "illegal character" buf
687 -- Int# is unlifted, and therefore faster than Bool for flags.
693 -------------------------------------------------------------------------------
697 = case expandWhile# is_space buf of { buf1 ->
698 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
699 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
700 case lookupUFM pragmaKeywordsFM lexeme of
701 Just kw -> cont kw (mergeLexemes buf buf2)
702 Nothing -> panic "lex_prag"
705 -------------------------------------------------------------------------------
708 lex_string cont exts s buf
709 = case currentChar# buf of
711 let buf' = incLexeme buf
712 s' = mkFastStringNarrow (map chr (reverse s))
713 in case currentChar# buf' of
714 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
715 then cont (ITprimstring s') (incLexeme buf')
716 else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
717 _ -> cont (ITstring s') buf'
719 -- ignore \& in a string, deal with string gaps
720 '\\'# | next_ch `eqChar#` '&'#
721 -> lex_string cont exts s buf'
723 -> lex_stringgap cont exts s (incLexeme buf)
725 where next_ch = lookAhead# buf 1#
726 buf' = setCurrentPos# buf 2#
728 _ -> lex_char (lex_next_string cont s) exts buf
730 lex_stringgap cont exts s buf
731 = let buf' = incLexeme buf in
732 case currentChar# buf of
733 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
734 st{loc = incSrcLine loc}
735 '\\'# -> lex_string cont exts s buf'
736 c | is_space c -> lex_stringgap cont exts s buf'
737 other -> charError buf'
739 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
741 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
742 lex_char cont exts buf
743 = case currentChar# buf of
744 '\\'# -> lex_escape (cont exts) (incLexeme buf)
745 c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
746 other -> charError buf
748 char_end cont exts c buf
749 = case currentChar# buf of
750 '\''# -> let buf' = incLexeme buf in
751 case currentChar# buf' of
752 '#'# | glaExtsEnabled exts
753 -> cont (ITprimchar c) (incLexeme buf')
754 _ -> cont (ITchar c) buf'
758 = let buf' = incLexeme buf in
759 case currentChar# buf of
760 'a'# -> cont (ord '\a') buf'
761 'b'# -> cont (ord '\b') buf'
762 'f'# -> cont (ord '\f') buf'
763 'n'# -> cont (ord '\n') buf'
764 'r'# -> cont (ord '\r') buf'
765 't'# -> cont (ord '\t') buf'
766 'v'# -> cont (ord '\v') buf'
767 '\\'# -> cont (ord '\\') buf'
768 '"'# -> cont (ord '\"') buf'
769 '\''# -> cont (ord '\'') buf'
770 '^'# -> let c = currentChar# buf' in
771 if c `geChar#` '@'# && c `leChar#` '_'#
772 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
775 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
776 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
778 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
780 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
781 Just buf2 <- [prefixMatch buf p] ] of
782 (c,buf2):_ -> cont (ord c) buf2
785 after_charnum cont i buf
786 = if i >= 0 && i <= 0x10FFFF
787 then cont (fromInteger i) buf
790 readNum cont buf is_digit base conv = read buf 0
792 = case currentChar# buf of { c ->
794 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
800 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
801 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
803 hex c | is_digit c = ord# c -# ord# '0'#
804 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
805 oct_or_dec c = ord# c -# ord# '0'#
807 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
810 | c `geChar#` 'A'# && c `leChar#` 'Z'#
811 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
814 charError buf = lexError "error in character literal" buf
816 silly_escape_chars = [
853 -------------------------------------------------------------------------------
855 lex_demand cont buf =
856 case read_em [] buf of { (ls,buf') ->
857 case currentChar# buf' of
858 'b'# -> cont (ITstrict (StrictSig (mkTopDmdType ls BotRes))) (incLexeme buf')
859 'm'# -> cont (ITstrict (StrictSig (mkTopDmdType ls RetCPR))) (incLexeme buf')
860 _ -> cont (ITstrict (StrictSig (mkTopDmdType ls TopRes))) buf'
864 case currentChar# buf of
865 'T'# -> read_em (Top : acc) (stepOn buf)
866 'L'# -> read_em (lazyDmd : acc) (stepOn buf)
867 'A'# -> read_em (Abs : acc) (stepOn buf)
868 'V'# -> read_em (evalDmd : acc) (stepOn buf) -- Temporary, until
869 -- we've recompiled prelude etc
870 'C'# -> do_unary Call acc (stepOnBy# buf 2#) -- Skip 'C('
872 'U'# -> do_seq1 Eval acc (stepOnBy# buf 1#)
873 'D'# -> do_seq1 Defer acc (stepOnBy# buf 1#)
874 'S'# -> do_seq1 (Box . Eval) acc (stepOnBy# buf 1#)
876 _ -> (reverse acc, buf)
879 = case currentChar# buf of
880 '('# -> do_seq2 fn acc (stepOnBy# buf 1#)
881 _ -> read_em (fn (Poly Abs) : acc) buf
884 = case read_em [] buf of { (dmds, buf) ->
885 case currentChar# buf of
886 ')'# -> read_em (fn (Prod dmds) : acc)
888 '*'# -> ASSERT( length dmds == 1 )
889 read_em (fn (Poly (head dmds)) : acc)
890 (stepOnBy# buf 2#) -- Skip '*)'
894 = case read_em [] buf of
895 ([dmd], rest) -> read_em (fn dmd : acc) (stepOn rest) -- Skip ')'
899 case currentChar# buf of
900 'C'# -> cont ITsccAllCafs (incLexeme buf)
901 other -> cont ITscc buf
903 -----------------------------------------------------------------------------
906 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
907 lex_num cont exts acc buf =
908 case scanNumLit acc buf of
910 case currentChar# buf' of
911 '.'# | is_digit (lookAhead# buf' 1#) ->
912 -- this case is not optimised at all, as the
913 -- presence of floating point numbers in interface
914 -- files is not that common. (ToDo)
915 case expandWhile# is_digit (incLexeme buf') of
916 buf2 -> -- points to first non digit char
918 let l = case currentChar# buf2 of
924 = let buf3 = incLexeme buf2 in
925 case currentChar# buf3 of
926 '-'# -> expandWhile# is_digit (incLexeme buf3)
927 '+'# -> expandWhile# is_digit (incLexeme buf3)
928 x | is_digit x -> expandWhile# is_digit buf3
931 v = readRational__ (lexemeToString l)
933 in case currentChar# l of -- glasgow exts only
934 '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
935 case currentChar# l' of
936 '#'# -> cont (ITprimdouble v) (incLexeme l')
937 _ -> cont (ITprimfloat v) l'
938 _ -> cont (ITrational v) l
940 _ -> after_lexnum cont exts acc' buf'
942 after_lexnum cont exts i buf
943 = case currentChar# buf of
944 '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
945 _ -> cont (ITinteger i) buf
947 -----------------------------------------------------------------------------
948 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
950 -- we lexemeToFastString on the bit between the ``''s, but include the
951 -- quotes in the full lexeme.
953 lex_cstring cont buf =
954 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
955 Just buf' -> cont (ITlitlit (lexemeToFastString
956 (setCurrentPos# buf' (negateInt# 2#))))
957 (mergeLexemes buf buf')
958 Nothing -> lexError "unterminated ``" buf
960 -----------------------------------------------------------------------------
961 -- identifiers, symbols etc.
963 lex_ip ip_constr cont buf =
964 case expandWhile# is_ident buf of
965 buf' -> cont (ip_constr (tailFS lexeme)) buf'
966 where lexeme = lexemeToFastString buf'
968 lex_id cont exts buf =
969 let buf1 = expandWhile# is_ident buf in
972 case (if glaExtsEnabled exts
973 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
974 else buf1) of { buf' ->
976 let lexeme = lexemeToFastString buf' in
978 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
979 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
983 let var_token = cont (ITvarid lexeme) buf' in
985 if not (glaExtsEnabled exts)
989 case lookupUFM ghcExtensionKeywordsFM lexeme of {
990 Just kwd_token -> cont kwd_token buf';
997 case expandWhile# is_symbol buf of
998 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
999 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
1000 cont kwd_token buf' ;
1001 Nothing -> --trace ("sym: "++unpackFS lexeme) $
1002 cont (mk_var_token lexeme) buf'
1004 where lexeme = lexemeToFastString buf'
1007 -- lex_con recursively collects components of a qualified identifer.
1008 -- The argument buf is the StringBuffer representing the lexeme
1009 -- identified so far, where the next character is upper-case.
1011 lex_con cont exts buf =
1012 -- trace ("con: "{-++unpackFS lexeme-}) $
1013 let empty_buf = stepOverLexeme buf in
1014 case expandWhile# is_ident empty_buf of { buf1 ->
1015 case slurp_trailing_hashes buf1 exts of { con_buf ->
1017 let all_buf = mergeLexemes buf con_buf
1019 con_lexeme = lexemeToFastString con_buf
1020 mod_lexeme = lexemeToFastString (decLexeme buf)
1021 all_lexeme = lexemeToFastString all_buf
1024 | emptyLexeme buf = cont (ITconid con_lexeme) all_buf
1025 | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1028 case currentChar# all_buf of
1029 '.'# -> maybe_qualified cont exts all_lexeme
1030 (incLexeme all_buf) just_a_conid
1035 maybe_qualified cont exts mod buf just_a_conid =
1036 -- trace ("qid: "{-++unpackFS lexeme-}) $
1037 case currentChar# buf of
1038 '['# -> -- Special case for []
1039 case lookAhead# buf 1# of
1040 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
1043 '('# -> -- Special case for (,,,)
1044 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1045 case lookAhead# buf 1# of
1046 '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1047 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
1050 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
1051 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
1054 '-'# -> case lookAhead# buf 1# of
1055 '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
1056 _ -> lex_id3 cont exts mod buf just_a_conid
1058 _ -> lex_id3 cont exts mod buf just_a_conid
1061 lex_id3 cont exts mod buf just_a_conid
1062 | is_upper (currentChar# buf) =
1063 lex_con cont exts buf
1065 | is_symbol (currentChar# buf) =
1067 start_new_lexeme = stepOverLexeme buf
1069 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1070 case expandWhile# is_symbol start_new_lexeme of { buf' ->
1072 lexeme = lexemeToFastString buf'
1073 -- real lexeme is M.<sym>
1074 new_buf = mergeLexemes buf buf'
1076 cont (mk_qvar_token mod lexeme) new_buf
1077 -- wrong, but arguably morally right: M... is now a qvarsym
1082 start_new_lexeme = stepOverLexeme buf
1084 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1085 case expandWhile# is_ident start_new_lexeme of { buf1 ->
1090 case slurp_trailing_hashes buf1 exts of { buf' ->
1093 lexeme = lexemeToFastString buf'
1094 new_buf = mergeLexemes buf buf'
1095 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1097 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1098 Nothing -> is_a_qvarid ;
1100 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
1101 -> is_a_qvarid -- recognised as keywords here.
1103 -> just_a_conid -- avoid M.where etc.
1106 slurp_trailing_hashes buf exts
1107 | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1112 | is_upper f = ITconid pk_str
1113 | is_ident f = ITvarid pk_str
1114 | f `eqChar#` ':'# = ITconsym pk_str
1115 | otherwise = ITvarsym pk_str
1117 (C# f) = _HEAD_ pk_str
1118 -- tl = _TAIL_ pk_str
1120 mk_qvar_token m token =
1121 -- trace ("mk_qvar ") $
1122 case mk_var_token token of
1123 ITconid n -> ITqconid (m,n)
1124 ITvarid n -> ITqvarid (m,n)
1125 ITconsym n -> ITqconsym (m,n)
1126 ITvarsym n -> ITqvarsym (m,n)
1127 _ -> ITunknown (show token)
1130 ----------------------------------------------------------------------------
1131 Horrible stuff for dealing with M.(,,,)
1134 lex_tuple cont mod buf back_off =
1138 case currentChar# buf of
1139 ','# -> go (n+1) (stepOn buf)
1140 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1143 lex_ubx_tuple cont mod buf back_off =
1147 case currentChar# buf of
1148 ','# -> go (n+1) (stepOn buf)
1149 '#'# -> case lookAhead# buf 1# of
1150 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1156 -----------------------------------------------------------------------------
1157 'lexPragma' rips along really fast, looking for a '##-}',
1158 indicating the end of the pragma we're skipping
1161 lexPragma cont contf inStr buf =
1162 case currentChar# buf of
1163 '#'# | inStr ==# 0# ->
1164 case lookAhead# buf 1# of { '#'# ->
1165 case lookAhead# buf 2# of { '-'# ->
1166 case lookAhead# buf 3# of { '}'# ->
1167 contf cont (lexemeToBuffer buf)
1168 (stepOverLexeme (setCurrentPos# buf 4#));
1169 _ -> lexPragma cont contf inStr (incLexeme buf) };
1170 _ -> lexPragma cont contf inStr (incLexeme buf) };
1171 _ -> lexPragma cont contf inStr (incLexeme buf) }
1175 odd_slashes buf flg i# =
1176 case lookAhead# buf i# of
1177 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1180 not_inStr = if inStr ==# 0# then 1# else 0#
1182 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1183 '\\'# -> -- escaping something..
1184 if odd_slashes buf True (negateInt# 2#)
1185 then -- odd number of slashes, " is escaped.
1186 lexPragma cont contf inStr (incLexeme buf)
1187 else -- even number of slashes, \ is escaped.
1188 lexPragma cont contf not_inStr (incLexeme buf)
1189 _ -> lexPragma cont contf not_inStr (incLexeme buf)
1191 '\''# | inStr ==# 0# ->
1192 case lookAhead# buf 1# of { '"'# ->
1193 case lookAhead# buf 2# of { '\''# ->
1194 lexPragma cont contf inStr (setCurrentPos# buf 3#);
1195 _ -> lexPragma cont contf inStr (incLexeme buf) };
1196 _ -> lexPragma cont contf inStr (incLexeme buf) }
1198 -- a sign that the input is ill-formed, since pragmas are
1199 -- assumed to always be properly closed (in .hi files).
1200 '\NUL'# -> trace "lexPragma: unexpected end-of-file" $
1201 cont (ITunknown "\NUL") buf
1203 _ -> lexPragma cont contf inStr (incLexeme buf)
1207 -----------------------------------------------------------------------------
1218 data PState = PState {
1220 extsBitmap :: Int#, -- bitmap that determines permitted extensions
1223 context :: [LayoutContext]
1226 type P a = StringBuffer -- Input string
1231 returnP a buf s = POk s a
1233 thenP :: P a -> (a -> P b) -> P b
1234 m `thenP` k = \ buf s ->
1236 POk s1 a -> k a buf s1
1237 PFailed err -> PFailed err
1239 thenP_ :: P a -> P b -> P b
1240 m `thenP_` k = m `thenP` \_ -> k
1242 mapP :: (a -> P b) -> [a] -> P [b]
1243 mapP f [] = returnP []
1246 mapP f as `thenP` \bs ->
1249 failP :: String -> P a
1250 failP msg buf s = PFailed (text msg)
1252 failMsgP :: Message -> P a
1253 failMsgP msg buf s = PFailed msg
1255 lexError :: String -> P a
1256 lexError str buf s@PState{ loc = loc }
1257 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1259 getSrcLocP :: P SrcLoc
1260 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1262 -- use a temporary SrcLoc for the duration of the argument
1263 setSrcLocP :: SrcLoc -> P a -> P a
1264 setSrcLocP new_loc p buf s =
1265 case p buf s{ loc=new_loc } of
1267 PFailed e -> PFailed e
1269 getSrcFile :: P FAST_STRING
1270 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1272 pushContext :: LayoutContext -> P ()
1273 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1277 This special case in layoutOn is to handle layout contexts with are
1278 indented the same or less than the current context. This is illegal
1279 according to the Haskell spec, so we have to arrange to close the
1280 current context. eg.
1285 after the first 'where', the sequence of events is:
1287 - layout system inserts a ';' (column 0)
1288 - parser begins a new context at column 0
1289 - parser shifts ';' (legal empty declaration)
1290 - parser sees 'class': parse error (we're still in the inner context)
1292 trouble is, by the time we know we need a new context, the lexer has
1293 already generated the ';'. Hacky solution is as follows: since we
1294 know the column of the next token (it's the column number of the new
1295 context), we set the ACTUAL column number of the new context to this
1296 numer plus one. Hence the next time the lexer is called, a '}' will
1297 be generated to close the new context straight away. Furthermore, we
1298 have to set the atbol flag so that the ';' that the parser shifted as
1299 part of the new context is re-generated.
1301 when the new context is *less* indented than the current one:
1303 f = f where g = g where
1306 - current context: column 12.
1307 - on seeing 'h' (column 0), the layout system inserts '}'
1308 - parser starts a new context, column 0
1309 - parser sees '}', uses it to close new context
1310 - we still need to insert another '}' followed by a ';',
1311 hence the atbol trick.
1313 There's also a special hack in here to deal with
1320 i.e. the inner context is at the same indentation level as the outer
1321 context. This is strictly illegal according to Haskell 98, but
1322 there's a lot of existing code using this style and it doesn't make
1323 any sense to disallow it, since empty 'do' lists don't make sense.
1326 layoutOn :: Bool -> P ()
1327 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1328 let offset = lexemeIndex buf -# bol in
1331 | if strict then prev_off >=# offset else prev_off ># offset ->
1332 --trace ("layout on, column: " ++ show (I# offset)) $
1333 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1335 --trace ("layout on, column: " ++ show (I# offset)) $
1336 POk s{ context = Layout offset : ctx } ()
1339 layoutOff buf s@(PState{ context = ctx }) =
1340 POk s{ context = NoLayout:ctx } ()
1343 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1345 (_:tl) -> POk s{ context = tl } ()
1346 [] -> PFailed (srcParseErr buf loc)
1349 Note that if the name of the file we're processing ends
1350 with `hi-boot', we accept it on faith as having the right
1351 version. This is done so that .hi-boot files that comes
1352 with hsc don't have to be updated before every release,
1353 *and* it allows us to share .hi-boot files with versions
1354 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1356 If the version number is 0, the checking is also turned off.
1357 (needed to deal with GHC.hi only!)
1359 Once we can assume we're compiling with a version of ghc that
1360 supports interface file checking, we can drop the special
1363 checkVersion :: Maybe Integer -> P ()
1364 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1365 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1366 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1367 checkVersion mb@Nothing buf s@(PState{loc = loc})
1368 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1369 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1372 -- for reasons of efficiency, flags indicating language extensions (eg,
1373 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1376 glaExtsBit, ffiBit, parrBit :: Int
1378 ffiBit = 1 -- FIXME: not used yet; still part of `glaExtsBit'
1381 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1382 glaExtsEnabled flags = testBit (I# flags) glaExtsBit
1383 ffiEnabled flags = testBit (I# flags) ffiBit
1384 parrEnabled flags = testBit (I# flags) parrBit
1386 -- convenient record-based bitmap for the interface to the rest of the world
1388 data ExtFlags = ExtFlags {
1389 glasgowExtsEF :: Bool,
1390 -- ffiEF :: Bool, -- commented out to avoid warnings
1391 parrEF :: Bool -- while not used yet
1394 -- create a parse state
1396 mkPState :: SrcLoc -> ExtFlags -> PState
1397 mkPState loc exts = PState {
1399 extsBitmap = case bitmap of {I# bits -> bits},
1405 bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts
1406 -- .|. ffiBit `setBitIf` ffiEF exts
1407 .|. parrBit `setBitIf` parrEF exts
1409 b `setBitIf` cond | cond = bit b
1413 -----------------------------------------------------------------
1415 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1417 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1418 text (lexemeToString s), char '\'']
1420 ifaceVersionErr hi_vers l toks
1421 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1422 ptext SLIT("Expected"), int opt_HiVersion,
1423 ptext SLIT("found "), pp_version]
1427 Nothing -> ptext SLIT("pre ghc-3.02 version")
1428 Just v -> ptext SLIT("version") <+> integer v
1430 -----------------------------------------------------------------------------
1432 srcParseErr :: StringBuffer -> SrcLoc -> Message
1436 then ptext SLIT(": parse error (possibly incorrect indentation)")
1437 else hcat [ptext SLIT(": parse error on input "),
1438 char '`', text token, char '\'']
1441 token = lexemeToString s