2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
22 ifaceParseErr, srcParseErr,
25 Token(..), lexer, ParseResult(..), PState(..),
29 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
30 getSrcLocP, getSrcFile,
31 layoutOn, layoutOff, pushContext, popContext
34 #include "HsVersions.h"
36 import Char ( isSpace, toUpper )
37 import List ( isSuffixOf )
39 import IdInfo ( InlinePragInfo(..) )
40 import PrelNames ( mkTupNameStr )
41 import CmdLineOpts ( opt_HiVersion, opt_NoHiCheck )
42 import Demand ( Demand(..) {- instance Read -} )
43 import UniqFM ( listToUFM, lookupUFM )
44 import BasicTypes ( NewOrData(..), Boxity(..) )
45 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
46 replaceSrcLine, mkSrcLoc )
48 import ErrUtils ( Message )
56 import PrelRead ( readRational__ ) -- Glasgow non-std
59 %************************************************************************
61 \subsection{Data types}
63 %************************************************************************
65 The token data type, fairly un-interesting except from one
66 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
67 strictness, unfolding etc).
69 The Idea/Observation here is that the renamer needs to scan through
70 all of an interface file before it can continue. But only a fraction
71 of the information contained in the file turns out to be useful, so
72 delaying as much as possible of the scanning and parsing of an
73 interface file Makes Sense (Heap profiles of the compiler
74 show a reduction in heap usage by at least a factor of two,
77 Hence, the interface file lexer spots when value declarations are
78 being scanned and return the @ITidinfo@ and @ITtype@ constructors
79 for the type and any other id info for that binding (unfolding, strictness
80 etc). These constructors are applied to the result of lexing these sub-chunks.
82 The lexing of the type and id info is all done lazily, of course, so
83 the scanning (and subsequent parsing) will be done *only* on the ids the
84 renamer finds out that it is interested in. The rest will just be junked.
85 Laziness, you know it makes sense :-)
89 = ITas -- Haskell keywords
115 | ITforall -- GHC extension keywords
125 | ITinterface -- interface keywords
134 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
153 | ITunfold InlinePragInfo
154 | ITstrict ([Demand], Bool)
161 | ITspecialise_prag -- Pragmas
170 | ITdotdot -- reserved symbols
184 | ITbiglam -- GHC-extension symbols
186 | ITocurly -- special symbols
188 | ITocurlybar -- {|, for type applications
189 | ITccurlybar -- |}, for type applications
202 | ITvarid FAST_STRING -- identifiers
203 | ITconid FAST_STRING
204 | ITvarsym FAST_STRING
205 | ITconsym FAST_STRING
206 | ITqvarid (FAST_STRING,FAST_STRING)
207 | ITqconid (FAST_STRING,FAST_STRING)
208 | ITqvarsym (FAST_STRING,FAST_STRING)
209 | ITqconsym (FAST_STRING,FAST_STRING)
211 | ITipvarid FAST_STRING -- GHC extension: implicit param: ?x
213 | ITpragma StringBuffer
216 | ITstring FAST_STRING
218 | ITrational Rational
221 | ITprimstring FAST_STRING
223 | ITprimfloat Rational
224 | ITprimdouble Rational
225 | ITlitlit FAST_STRING
227 | ITunknown String -- Used when the lexer can't make sense of it
228 | ITeof -- end of file token
229 deriving Show -- debugging
232 -----------------------------------------------------------------------------
236 pragmaKeywordsFM = listToUFM $
237 map (\ (x,y) -> (_PK_ x,y))
238 [( "SPECIALISE", ITspecialise_prag ),
239 ( "SPECIALIZE", ITspecialise_prag ),
240 ( "SOURCE", ITsource_prag ),
241 ( "INLINE", ITinline_prag ),
242 ( "NOINLINE", ITnoinline_prag ),
243 ( "NOTINLINE", ITnoinline_prag ),
244 ( "LINE", ITline_prag ),
245 ( "RULES", ITrules_prag ),
246 ( "RULEZ", ITrules_prag ), -- american spelling :-)
247 ( "DEPRECATED", ITdeprecated_prag )
250 haskellKeywordsFM = listToUFM $
251 map (\ (x,y) -> (_PK_ x,y))
252 [( "_", ITunderscore ),
255 ( "class", ITclass ),
257 ( "default", ITdefault ),
258 ( "deriving", ITderiving ),
261 ( "hiding", IThiding ),
263 ( "import", ITimport ),
265 ( "infix", ITinfix ),
266 ( "infixl", ITinfixl ),
267 ( "infixr", ITinfixr ),
268 ( "instance", ITinstance ),
270 ( "module", ITmodule ),
271 ( "newtype", ITnewtype ),
273 ( "qualified", ITqualified ),
276 ( "where", ITwhere ),
280 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
281 ghcExtensionKeywordsFM = listToUFM $
282 map (\ (x,y) -> (_PK_ x,y))
283 [ ( "forall", ITforall ),
284 ( "foreign", ITforeign ),
285 ( "export", ITexport ),
286 ( "label", ITlabel ),
287 ( "dynamic", ITdynamic ),
288 ( "unsafe", ITunsafe ),
290 ( "stdcall", ITstdcallconv),
291 ( "ccall", ITccallconv),
292 ("_ccall_", ITccall (False, False, False)),
293 ("_ccall_GC_", ITccall (False, False, True)),
294 ("_casm_", ITccall (False, True, False)),
295 ("_casm_GC_", ITccall (False, True, True)),
297 -- interface keywords
298 ("__interface", ITinterface),
300 ("__export", IT__export),
301 ("__depends", ITdepends),
302 ("__forall", IT__forall),
303 ("__letrec", ITletrec),
304 ("__coerce", ITcoerce),
305 ("__inline_me", ITinlineMe),
306 ("__inline_call", ITinlineCall),
307 ("__depends", ITdepends),
308 ("__DEFAULT", ITdefaultbranch),
310 ("__integer", ITinteger_lit),
311 ("__float", ITfloat_lit),
312 ("__int64", ITint64_lit),
313 ("__word", ITword_lit),
314 ("__word64", ITword64_lit),
315 ("__rational", ITrational_lit),
316 ("__addr", ITaddr_lit),
317 ("__label", ITlabel_lit),
318 ("__litlit", ITlit_lit),
319 ("__string", ITstring_lit),
322 ("__fuall", ITfuall),
324 ("__P", ITspecialise),
327 ("__D", ITdeprecated),
328 ("__U", ITunfold NoInlinePragInfo),
330 ("__ccall", ITccall (False, False, False)),
331 ("__ccall_GC", ITccall (False, False, True)),
332 ("__dyn_ccall", ITccall (True, False, False)),
333 ("__dyn_ccall_GC", ITccall (True, False, True)),
334 ("__casm", ITccall (False, True, False)),
335 ("__dyn_casm", ITccall (True, True, False)),
336 ("__casm_GC", ITccall (False, True, True)),
337 ("__dyn_casm_GC", ITccall (True, True, True)),
343 haskellKeySymsFM = listToUFM $
344 map (\ (x,y) -> (_PK_ x,y))
357 ,(".", ITdot) -- sadly, for 'forall a . t'
361 -----------------------------------------------------------------------------
366 - (glaexts) lexing an interface file or -fglasgow-exts
367 - (bol) pointer to beginning of line (for column calculations)
368 - (buf) pointer to beginning of token
369 - (buf) pointer to current char
370 - (atbol) flag indicating whether we're at the beginning of a line
373 lexer :: (Token -> P a) -> P a
374 lexer cont buf s@(PState{
376 glasgow_exts = glaexts,
382 -- first, start a new lexeme and lose all the whitespace
384 tab line bol atbol (stepOverLexeme buf)
386 line = srcLocLine loc
388 tab y bol atbol buf = -- trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
389 case currentChar# buf of
392 if bufferExhausted (stepOn buf)
393 then cont ITeof buf s'
394 else trace "lexer: misplaced NUL?" $
395 tab y bol atbol (stepOn buf)
397 '\n'# -> let buf' = stepOn buf
398 in tab (y +# 1#) (currentIndex# buf') 1# buf'
400 -- find comments. This got harder in Haskell 98.
401 '-'# -> let trundle n =
402 let next = lookAhead# buf n in
403 if next `eqChar#` '-'# then trundle (n +# 1#)
404 else if is_symbol next || n <# 2#
407 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
410 -- comments and pragmas. We deal with LINE pragmas here,
411 -- and throw out any unrecognised pragmas as comments. Any
412 -- pragmas we know about are dealt with later (after any layout
413 -- processing if necessary).
414 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
415 if lookAhead# buf 2# `eqChar#` '#'# then
416 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
417 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
418 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
419 let lexeme = mkFastString -- ToDo: too slow
420 (map toUpper (lexemeToString buf2)) in
421 case lookupUFM pragmaKeywordsFM lexeme of
423 line_prag skip_to_end buf2 s'
424 Just other -> is_a_token
425 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
428 else skip_to_end (stepOnBy# buf 2#) s'
430 skip_to_end = nested_comment (lexer cont)
432 -- special GHC extension: we grok cpp-style #line pragmas
433 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
434 line_prag next_line (stepOn buf) s'
436 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
438 -- tabs have been expanded beforehand
439 c | is_space c -> tab y bol atbol (stepOn buf)
440 | otherwise -> is_a_token
442 where s' = s{loc = replaceSrcLine loc y,
446 is_a_token | atbol /=# 0# = lexBOL cont buf s'
447 | otherwise = lexToken cont glaexts buf s'
449 -- {-# LINE .. #-} pragmas. yeuch.
450 line_prag cont buf s@PState{loc=loc} =
451 case expandWhile# is_space buf of { buf1 ->
452 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
453 -- subtract one: the line number refers to the *following* line.
454 let real_line = line - 1 in
455 case fromInteger real_line of { i@(I# l) ->
456 -- ToDo, if no filename then we skip the newline.... d'oh
457 case expandWhile# is_space buf2 of { buf3 ->
458 case currentChar# buf3 of
460 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
462 file = lexemeToFastString buf4
463 new_buf = stepOn (stepOverLexeme buf4)
465 if nullFastString file
466 then cont new_buf s{loc = replaceSrcLine loc l}
467 else cont new_buf s{loc = mkSrcLoc file i}
469 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
472 nested_comment :: P a -> P a
473 nested_comment cont buf = loop buf
476 case currentChar# buf of
477 '\NUL'# | bufferExhausted (stepOn buf) ->
478 lexError "unterminated `{-'" buf -- -}
479 '-'# | lookAhead# buf 1# `eqChar#` '}'# ->
480 cont (stepOnBy# buf 2#)
482 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
483 nested_comment (nested_comment cont) (stepOnBy# buf 2#)
485 '\n'# -> \ s@PState{loc=loc} ->
486 let buf' = stepOn buf in
487 nested_comment cont buf'
488 s{loc = incSrcLine loc, bol = currentIndex# buf',
491 _ -> nested_comment cont (stepOn buf)
493 -- When we are lexing the first token of a line, check whether we need to
494 -- insert virtual semicolons or close braces due to layout.
496 lexBOL :: (Token -> P a) -> P a
497 lexBOL cont buf s@(PState{
499 glasgow_exts = glaexts,
504 if need_close_curly then
505 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
506 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
507 else if need_semi_colon then
508 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
509 cont ITsemi buf s{atbol = 0#}
511 lexToken cont glaexts buf s{atbol = 0#}
513 col = currentIndex# buf -# bol
526 Layout n -> col ==# n
529 lexToken :: (Token -> P a) -> Int# -> P a
530 lexToken cont glaexts buf =
531 -- trace "lexToken" $
532 case currentChar# buf of
534 -- special symbols ----------------------------------------------------
535 '('# | flag glaexts && lookAhead# buf 1# `eqChar#` '#'#
536 -> cont IToubxparen (setCurrentPos# buf 2#)
538 -> cont IToparen (incLexeme buf)
540 ')'# -> cont ITcparen (incLexeme buf)
541 '['# -> cont ITobrack (incLexeme buf)
542 ']'# -> cont ITcbrack (incLexeme buf)
543 ','# -> cont ITcomma (incLexeme buf)
544 ';'# -> cont ITsemi (incLexeme buf)
545 '}'# -> \ s@PState{context = ctx} ->
547 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
548 _ -> lexError "too many '}'s" buf s
549 '|'# -> case lookAhead# buf 1# of
550 '}'# | flag glaexts -> cont ITccurlybar
551 (setCurrentPos# buf 2#)
552 _ -> lex_sym cont (incLexeme buf)
555 '#'# -> case lookAhead# buf 1# of
556 ')'# | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
557 '-'# -> case lookAhead# buf 2# of
558 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
559 _ -> lex_sym cont (incLexeme buf)
560 _ -> lex_sym cont (incLexeme buf)
562 '`'# | flag glaexts && lookAhead# buf 1# `eqChar#` '`'#
563 -> lex_cstring cont (setCurrentPos# buf 2#)
565 -> cont ITbackquote (incLexeme buf)
567 '{'# -> -- look for "{-##" special iface pragma
568 case lookAhead# buf 1# of
570 -> cont ITocurlybar (setCurrentPos# buf 2#)
571 '-'# -> case lookAhead# buf 2# of
572 '#'# -> case lookAhead# buf 3# of
575 = doDiscard False (stepOnBy# (stepOverLexeme buf) 4#) in
576 cont (ITpragma lexeme) buf'
577 _ -> lex_prag cont (setCurrentPos# buf 3#)
578 _ -> cont ITocurly (incLexeme buf)
579 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
581 -- strings/characters -------------------------------------------------
582 '\"'#{-"-} -> lex_string cont glaexts [] (incLexeme buf)
583 '\''# -> lex_char (char_end cont) glaexts (incLexeme buf)
585 -- strictness and cpr pragmas and __scc treated specially.
586 '_'# | flag glaexts ->
587 case lookAhead# buf 1# of
588 '_'# -> case lookAhead# buf 2# of
590 lex_demand cont (stepOnUntil (not . isSpace)
591 (stepOnBy# buf 3#)) -- past __S
593 cont ITcprinfo (stepOnBy# buf 3#) -- past __M
596 case prefixMatch (stepOnBy# buf 3#) "cc" of
597 Just buf' -> lex_scc cont (stepOverLexeme buf')
598 Nothing -> lex_id cont glaexts buf
599 _ -> lex_id cont glaexts buf
600 _ -> lex_id cont glaexts buf
602 -- Hexadecimal and octal constants
603 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
604 -> readNum (after_lexnum cont glaexts) buf' is_hexdigit 16 hex
605 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
606 -> readNum (after_lexnum cont glaexts) buf' is_octdigit 8 oct_or_dec
607 where ch = lookAhead# buf 1#
608 ch2 = lookAhead# buf 2#
609 buf' = setCurrentPos# buf 2#
612 if bufferExhausted (stepOn buf) then
615 trace "lexIface: misplaced NUL?" $
616 cont (ITunknown "\NUL") (stepOn buf)
618 '?'# | flag glaexts && is_lower (lookAhead# buf 1#) ->
619 lex_ip cont (incLexeme buf)
620 c | is_digit c -> lex_num cont glaexts 0 buf
621 | is_symbol c -> lex_sym cont buf
622 | is_upper c -> lex_con cont glaexts buf
623 | is_ident c -> lex_id cont glaexts buf
624 | otherwise -> lexError "illegal character" buf
626 -- Int# is unlifted, and therefore faster than Bool for flags.
632 -------------------------------------------------------------------------------
636 = case expandWhile# is_space buf of { buf1 ->
637 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
638 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
639 case lookupUFM pragmaKeywordsFM lexeme of
640 Just kw -> cont kw (mergeLexemes buf buf2)
641 Nothing -> panic "lex_prag"
644 -------------------------------------------------------------------------------
647 lex_string cont glaexts s buf
648 = case currentChar# buf of
650 let buf' = incLexeme buf; s' = mkFastStringInt (reverse s) in
651 case currentChar# buf' of
652 '#'# | flag glaexts -> if all (<= 0xFF) s
653 then cont (ITprimstring s') (incLexeme buf')
654 else lexError "primitive string literal must contain only characters <= '\xFF'" buf'
655 _ -> cont (ITstring s') buf'
657 -- ignore \& in a string, deal with string gaps
658 '\\'# | next_ch `eqChar#` '&'#
659 -> lex_string cont glaexts s buf'
661 -> lex_stringgap cont glaexts s (incLexeme buf)
663 where next_ch = lookAhead# buf 1#
664 buf' = setCurrentPos# buf 2#
666 _ -> lex_char (lex_next_string cont s) glaexts buf
668 lex_stringgap cont glaexts s buf
669 = let buf' = incLexeme buf in
670 case currentChar# buf of
671 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont glaexts s buf'
672 st{loc = incSrcLine loc}
673 '\\'# -> lex_string cont glaexts s buf'
674 c | is_space c -> lex_stringgap cont glaexts s buf'
675 other -> charError buf'
677 lex_next_string cont s glaexts c buf = lex_string cont glaexts (c:s) buf
679 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
680 lex_char cont glaexts buf
681 = case currentChar# buf of
682 '\\'# -> lex_escape (cont glaexts) (incLexeme buf)
683 c | is_any c -> cont glaexts (I# (ord# c)) (incLexeme buf)
684 other -> charError buf
686 char_end cont glaexts c buf
687 = case currentChar# buf of
688 '\''# -> let buf' = incLexeme buf in
689 case currentChar# buf' of
691 -> cont (ITprimchar c) (incLexeme buf')
692 _ -> cont (ITchar c) buf'
696 = let buf' = incLexeme buf in
697 case currentChar# buf of
698 'a'# -> cont (ord '\a') buf'
699 'b'# -> cont (ord '\b') buf'
700 'f'# -> cont (ord '\f') buf'
701 'n'# -> cont (ord '\n') buf'
702 'r'# -> cont (ord '\r') buf'
703 't'# -> cont (ord '\t') buf'
704 'v'# -> cont (ord '\v') buf'
705 '\\'# -> cont (ord '\\') buf'
706 '"'# -> cont (ord '\"') buf'
707 '\''# -> cont (ord '\'') buf'
708 '^'# -> let c = currentChar# buf' in
709 if c `geChar#` '@'# && c `leChar#` '_'#
710 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
713 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
714 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
716 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
718 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
719 Just buf2 <- [prefixMatch buf p] ] of
720 (c,buf2):_ -> cont (ord c) buf2
723 after_charnum cont i buf
724 = if i >= 0 && i <= 0x7FFFFFFF
725 then cont (fromInteger i) buf
728 readNum cont buf is_digit base conv = read buf 0
730 = case currentChar# buf of { c ->
732 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
738 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
739 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
741 hex c | is_digit c = ord# c -# ord# '0'#
742 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
743 oct_or_dec c = ord# c -# ord# '0'#
745 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
748 | c `geChar#` 'A'# && c `leChar#` 'Z'#
749 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
752 charError buf = lexError "error in character literal" buf
754 silly_escape_chars = [
791 -------------------------------------------------------------------------------
793 lex_demand cont buf =
794 case read_em [] buf of { (ls,buf') ->
795 case currentChar# buf' of
796 'B'# -> cont (ITstrict (ls, True )) (incLexeme buf')
797 _ -> cont (ITstrict (ls, False)) buf'
800 -- code snatched from Demand.lhs
802 case currentChar# buf of
803 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
804 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
805 'S'# -> read_em (WwStrict : acc) (stepOn buf)
806 'P'# -> read_em (WwPrim : acc) (stepOn buf)
807 'E'# -> read_em (WwEnum : acc) (stepOn buf)
808 ')'# -> (reverse acc, stepOn buf)
809 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
810 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
811 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
812 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
813 _ -> (reverse acc, buf)
815 do_unpack new_or_data wrapper_unpacks acc buf
816 = case read_em [] buf of
817 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
822 case currentChar# buf of
823 'C'# -> cont ITsccAllCafs (incLexeme buf)
824 other -> cont ITscc buf
826 -----------------------------------------------------------------------------
829 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
830 lex_num cont glaexts acc buf =
831 case scanNumLit acc buf of
833 case currentChar# buf' of
834 '.'# | is_digit (lookAhead# buf' 1#) ->
835 -- this case is not optimised at all, as the
836 -- presence of floating point numbers in interface
837 -- files is not that common. (ToDo)
838 case expandWhile# is_digit (incLexeme buf') of
839 buf2 -> -- points to first non digit char
841 let l = case currentChar# buf2 of
847 = let buf3 = incLexeme buf2 in
848 case currentChar# buf3 of
849 '-'# -> expandWhile# is_digit (incLexeme buf3)
850 '+'# -> expandWhile# is_digit (incLexeme buf3)
851 x | is_digit x -> expandWhile# is_digit buf3
854 v = readRational__ (lexemeToString l)
856 in case currentChar# l of -- glasgow exts only
857 '#'# | flag glaexts -> let l' = incLexeme l in
858 case currentChar# l' of
859 '#'# -> cont (ITprimdouble v) (incLexeme l')
860 _ -> cont (ITprimfloat v) l'
861 _ -> cont (ITrational v) l
863 _ -> after_lexnum cont glaexts acc' buf'
865 after_lexnum cont glaexts i buf
866 = case currentChar# buf of
867 '#'# | flag glaexts -> cont (ITprimint i) (incLexeme buf)
868 _ -> cont (ITinteger i) buf
870 -----------------------------------------------------------------------------
871 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
873 -- we lexemeToFastString on the bit between the ``''s, but include the
874 -- quotes in the full lexeme.
876 lex_cstring cont buf =
877 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
878 Just buf' -> cont (ITlitlit (lexemeToFastString
879 (setCurrentPos# buf' (negateInt# 2#))))
880 (mergeLexemes buf buf')
881 Nothing -> lexError "unterminated ``" buf
883 -----------------------------------------------------------------------------
884 -- identifiers, symbols etc.
887 case expandWhile# is_ident buf of
888 buf' -> cont (ITipvarid lexeme) buf'
889 where lexeme = lexemeToFastString buf'
891 lex_id cont glaexts buf =
892 let buf1 = expandWhile# is_ident buf in
895 case (if flag glaexts
896 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
897 else buf1) of { buf' ->
899 let lexeme = lexemeToFastString buf' in
901 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
902 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
906 let var_token = cont (ITvarid lexeme) buf' in
908 if not (flag glaexts)
912 case lookupUFM ghcExtensionKeywordsFM lexeme of {
913 Just kwd_token -> cont kwd_token buf';
920 case expandWhile# is_symbol buf of
921 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
922 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
923 cont kwd_token buf' ;
924 Nothing -> --trace ("sym: "++unpackFS lexeme) $
925 cont (mk_var_token lexeme) buf'
927 where lexeme = lexemeToFastString buf'
930 lex_con cont glaexts buf =
931 -- trace ("con: "{-++unpackFS lexeme-}) $
932 case expandWhile# is_ident buf of { buf1 ->
933 case slurp_trailing_hashes buf1 glaexts of { buf' ->
935 case currentChar# buf' of
940 just_a_conid = cont (ITconid lexeme) buf'
941 lexeme = lexemeToFastString buf'
942 munch = lex_qid cont glaexts lexeme (incLexeme buf') just_a_conid
945 lex_qid cont glaexts mod buf just_a_conid =
946 -- trace ("quid: "{-++unpackFS lexeme-}) $
947 case currentChar# buf of
948 '['# -> -- Special case for []
949 case lookAhead# buf 1# of
950 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
953 '('# -> -- Special case for (,,,)
954 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
955 case lookAhead# buf 1# of
956 '#'# | flag glaexts -> case lookAhead# buf 2# of
957 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
960 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
961 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
964 '-'# -> case lookAhead# buf 1# of
965 '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
966 _ -> lex_id3 cont glaexts mod buf just_a_conid
967 _ -> lex_id3 cont glaexts mod buf just_a_conid
969 lex_id3 cont glaexts mod buf just_a_conid
970 | is_symbol (currentChar# buf) =
972 start_new_lexeme = stepOverLexeme buf
974 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
975 case expandWhile# is_symbol start_new_lexeme of { buf' ->
977 lexeme = lexemeToFastString buf'
978 -- real lexeme is M.<sym>
979 new_buf = mergeLexemes buf buf'
981 cont (mk_qvar_token mod lexeme) new_buf
982 -- wrong, but arguably morally right: M... is now a qvarsym
987 start_new_lexeme = stepOverLexeme buf
989 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
990 case expandWhile# is_ident start_new_lexeme of { buf1 ->
995 case slurp_trailing_hashes buf1 glaexts of { buf' ->
998 lexeme = lexemeToFastString buf'
999 new_buf = mergeLexemes buf buf'
1000 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1002 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1003 Just kwd_token -> just_a_conid; -- avoid M.where etc.
1004 Nothing -> is_a_qvarid
1005 -- TODO: special ids (as, qualified, hiding) shouldn't be
1006 -- recognised as keywords here. ie. M.as is a qualified varid.
1010 slurp_trailing_hashes buf glaexts
1011 | flag glaexts = expandWhile# (`eqChar#` '#'#) buf
1016 | is_upper f = ITconid pk_str
1017 | is_ident f = ITvarid pk_str
1018 | f `eqChar#` ':'# = ITconsym pk_str
1019 | otherwise = ITvarsym pk_str
1021 (C# f) = _HEAD_ pk_str
1022 -- tl = _TAIL_ pk_str
1024 mk_qvar_token m token =
1025 -- trace ("mk_qvar ") $
1026 case mk_var_token token of
1027 ITconid n -> ITqconid (m,n)
1028 ITvarid n -> ITqvarid (m,n)
1029 ITconsym n -> ITqconsym (m,n)
1030 ITvarsym n -> ITqvarsym (m,n)
1031 _ -> ITunknown (show token)
1034 ----------------------------------------------------------------------------
1035 Horrible stuff for dealing with M.(,,,)
1038 lex_tuple cont mod buf back_off =
1042 case currentChar# buf of
1043 ','# -> go (n+1) (stepOn buf)
1044 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1047 lex_ubx_tuple cont mod buf back_off =
1051 case currentChar# buf of
1052 ','# -> go (n+1) (stepOn buf)
1053 '#'# -> case lookAhead# buf 1# of
1054 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1060 -----------------------------------------------------------------------------
1061 doDiscard rips along really fast, looking for a '#-}',
1062 indicating the end of the pragma we're skipping
1065 doDiscard inStr buf =
1066 case currentChar# buf of
1068 case lookAhead# buf 1# of { '#'# ->
1069 case lookAhead# buf 2# of { '-'# ->
1070 case lookAhead# buf 3# of { '}'# ->
1071 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
1072 _ -> doDiscard inStr (incLexeme buf) };
1073 _ -> doDiscard inStr (incLexeme buf) };
1074 _ -> doDiscard inStr (incLexeme buf) }
1077 odd_slashes buf flg i# =
1078 case lookAhead# buf i# of
1079 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1082 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1083 '\\'# -> -- escaping something..
1084 if odd_slashes buf True (negateInt# 2#) then
1085 -- odd number of slashes, " is escaped.
1086 doDiscard inStr (incLexeme buf)
1088 -- even number of slashes, \ is escaped.
1089 doDiscard (not inStr) (incLexeme buf)
1090 _ -> case inStr of -- forced to avoid build-up
1091 True -> doDiscard False (incLexeme buf)
1092 False -> doDiscard True (incLexeme buf)
1093 _ -> doDiscard inStr (incLexeme buf)
1097 -----------------------------------------------------------------------------
1108 data PState = PState {
1110 glasgow_exts :: Int#,
1113 context :: [LayoutContext]
1116 type P a = StringBuffer -- Input string
1121 returnP a buf s = POk s a
1123 thenP :: P a -> (a -> P b) -> P b
1124 m `thenP` k = \ buf s ->
1126 POk s1 a -> k a buf s1
1127 PFailed err -> PFailed err
1129 thenP_ :: P a -> P b -> P b
1130 m `thenP_` k = m `thenP` \_ -> k
1132 mapP :: (a -> P b) -> [a] -> P [b]
1133 mapP f [] = returnP []
1136 mapP f as `thenP` \bs ->
1139 failP :: String -> P a
1140 failP msg buf s = PFailed (text msg)
1142 failMsgP :: Message -> P a
1143 failMsgP msg buf s = PFailed msg
1145 lexError :: String -> P a
1146 lexError str buf s@PState{ loc = loc }
1147 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1149 getSrcLocP :: P SrcLoc
1150 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1152 getSrcFile :: P FAST_STRING
1153 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1155 getContext :: P [LayoutContext]
1156 getContext buf s@(PState{ context = ctx }) = POk s ctx
1158 pushContext :: LayoutContext -> P ()
1159 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1163 This special case in layoutOn is to handle layout contexts with are
1164 indented the same or less than the current context. This is illegal
1165 according to the Haskell spec, so we have to arrange to close the
1166 current context. eg.
1171 after the first 'where', the sequence of events is:
1173 - layout system inserts a ';' (column 0)
1174 - parser begins a new context at column 0
1175 - parser shifts ';' (legal empty declaration)
1176 - parser sees 'class': parse error (we're still in the inner context)
1178 trouble is, by the time we know we need a new context, the lexer has
1179 already generated the ';'. Hacky solution is as follows: since we
1180 know the column of the next token (it's the column number of the new
1181 context), we set the ACTUAL column number of the new context to this
1182 numer plus one. Hence the next time the lexer is called, a '}' will
1183 be generated to close the new context straight away. Furthermore, we
1184 have to set the atbol flag so that the ';' that the parser shifted as
1185 part of the new context is re-generated.
1187 when the new context is *less* indented than the current one:
1189 f = f where g = g where
1192 - current context: column 12.
1193 - on seeing 'h' (column 0), the layout system inserts '}'
1194 - parser starts a new context, column 0
1195 - parser sees '}', uses it to close new context
1196 - we still need to insert another '}' followed by a ';',
1197 hence the atbol trick.
1199 There's also a special hack in here to deal with
1206 i.e. the inner context is at the same indentation level as the outer
1207 context. This is strictly illegal according to Haskell 98, but
1208 there's a lot of existing code using this style and it doesn't make
1209 any sense to disallow it, since empty 'do' lists don't make sense.
1212 layoutOn :: Bool -> P ()
1213 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1214 let offset = lexemeIndex buf -# bol in
1217 | if strict then prev_off >=# offset else prev_off ># offset ->
1218 --trace ("layout on, column: " ++ show (I# offset)) $
1219 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1221 --trace ("layout on, column: " ++ show (I# offset)) $
1222 POk s{ context = Layout offset : ctx } ()
1225 layoutOff buf s@(PState{ context = ctx }) =
1226 POk s{ context = NoLayout:ctx } ()
1229 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1231 (_:tl) -> POk s{ context = tl } ()
1232 [] -> PFailed (srcParseErr buf loc)
1235 Note that if the name of the file we're processing ends
1236 with `hi-boot', we accept it on faith as having the right
1237 version. This is done so that .hi-boot files that comes
1238 with hsc don't have to be updated before every release,
1239 *and* it allows us to share .hi-boot files with versions
1240 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1242 If the version number is 0, the checking is also turned off.
1243 (needed to deal with GHC.hi only!)
1245 Once we can assume we're compiling with a version of ghc that
1246 supports interface file checking, we can drop the special
1249 checkVersion :: Maybe Integer -> P ()
1250 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1251 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1252 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1253 checkVersion mb@Nothing buf s@(PState{loc = loc})
1254 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1255 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1257 -----------------------------------------------------------------
1259 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1261 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1262 text (lexemeToString s), char '\'']
1264 ifaceVersionErr hi_vers l toks
1265 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1266 ptext SLIT("Expected"), int opt_HiVersion,
1267 ptext SLIT("found "), pp_version]
1271 Nothing -> ptext SLIT("pre ghc-3.02 version")
1272 Just v -> ptext SLIT("version") <+> integer v
1274 -----------------------------------------------------------------------------
1276 srcParseErr :: StringBuffer -> SrcLoc -> Message
1280 then ptext SLIT(": parse error (possibly incorrect indentation)")
1281 else hcat [ptext SLIT(": parse error on input "),
1282 char '`', text token, char '\'']
1285 token = lexemeToString s