2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
22 ifaceParseErr, srcParseErr,
25 Token(..), lexer, ParseResult(..), PState(..),
29 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
30 getSrcLocP, getSrcFile,
31 layoutOn, layoutOff, pushContext, popContext
34 #include "HsVersions.h"
36 import Char ( isSpace, toUpper )
37 import List ( isSuffixOf )
39 import IdInfo ( InlinePragInfo(..) )
40 import PrelNames ( mkTupNameStr )
41 import CmdLineOpts ( opt_HiVersion, opt_NoHiCheck )
42 import Demand ( Demand(..) {- instance Read -} )
43 import UniqFM ( listToUFM, lookupUFM )
44 import BasicTypes ( NewOrData(..), Boxity(..) )
45 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
46 replaceSrcLine, mkSrcLoc )
48 import ErrUtils ( Message )
56 import PrelRead ( readRational__ ) -- Glasgow non-std
59 %************************************************************************
61 \subsection{Data types}
63 %************************************************************************
65 The token data type, fairly un-interesting except from one
66 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
67 strictness, unfolding etc).
69 The Idea/Observation here is that the renamer needs to scan through
70 all of an interface file before it can continue. But only a fraction
71 of the information contained in the file turns out to be useful, so
72 delaying as much as possible of the scanning and parsing of an
73 interface file Makes Sense (Heap profiles of the compiler
74 show a reduction in heap usage by at least a factor of two,
77 Hence, the interface file lexer spots when value declarations are
78 being scanned and return the @ITidinfo@ and @ITtype@ constructors
79 for the type and any other id info for that binding (unfolding, strictness
80 etc). These constructors are applied to the result of lexing these sub-chunks.
82 The lexing of the type and id info is all done lazily, of course, so
83 the scanning (and subsequent parsing) will be done *only* on the ids the
84 renamer finds out that it is interested in. The rest will just be junked.
85 Laziness, you know it makes sense :-)
89 = ITas -- Haskell keywords
115 | ITforall -- GHC extension keywords
125 | ITinterface -- interface keywords
133 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
151 | ITunfold InlinePragInfo
152 | ITstrict ([Demand], Bool)
159 | ITspecialise_prag -- Pragmas
168 | ITdotdot -- reserved symbols
182 | ITbiglam -- GHC-extension symbols
184 | ITocurly -- special symbols
198 | ITvarid FAST_STRING -- identifiers
199 | ITconid FAST_STRING
200 | ITvarsym FAST_STRING
201 | ITconsym FAST_STRING
202 | ITqvarid (FAST_STRING,FAST_STRING)
203 | ITqconid (FAST_STRING,FAST_STRING)
204 | ITqvarsym (FAST_STRING,FAST_STRING)
205 | ITqconsym (FAST_STRING,FAST_STRING)
207 | ITipvarid FAST_STRING -- GHC extension: implicit param: ?x
209 | ITpragma StringBuffer
212 | ITstring FAST_STRING
214 | ITrational Rational
217 | ITprimstring FAST_STRING
219 | ITprimfloat Rational
220 | ITprimdouble Rational
221 | ITlitlit FAST_STRING
223 | ITunknown String -- Used when the lexer can't make sense of it
224 | ITeof -- end of file token
225 deriving Show -- debugging
228 -----------------------------------------------------------------------------
232 pragmaKeywordsFM = listToUFM $
233 map (\ (x,y) -> (_PK_ x,y))
234 [( "SPECIALISE", ITspecialise_prag ),
235 ( "SPECIALIZE", ITspecialise_prag ),
236 ( "SOURCE", ITsource_prag ),
237 ( "INLINE", ITinline_prag ),
238 ( "NOINLINE", ITnoinline_prag ),
239 ( "NOTINLINE", ITnoinline_prag ),
240 ( "LINE", ITline_prag ),
241 ( "RULES", ITrules_prag ),
242 ( "RULEZ", ITrules_prag ), -- american spelling :-)
243 ( "DEPRECATED", ITdeprecated_prag )
246 haskellKeywordsFM = listToUFM $
247 map (\ (x,y) -> (_PK_ x,y))
248 [( "_", ITunderscore ),
251 ( "class", ITclass ),
253 ( "default", ITdefault ),
254 ( "deriving", ITderiving ),
257 ( "hiding", IThiding ),
259 ( "import", ITimport ),
261 ( "infix", ITinfix ),
262 ( "infixl", ITinfixl ),
263 ( "infixr", ITinfixr ),
264 ( "instance", ITinstance ),
266 ( "module", ITmodule ),
267 ( "newtype", ITnewtype ),
269 ( "qualified", ITqualified ),
272 ( "where", ITwhere ),
276 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
277 ghcExtensionKeywordsFM = listToUFM $
278 map (\ (x,y) -> (_PK_ x,y))
279 [ ( "forall", ITforall ),
280 ( "foreign", ITforeign ),
281 ( "export", ITexport ),
282 ( "label", ITlabel ),
283 ( "dynamic", ITdynamic ),
284 ( "unsafe", ITunsafe ),
286 ( "stdcall", ITstdcallconv),
287 ( "ccall", ITccallconv),
288 ("_ccall_", ITccall (False, False, False)),
289 ("_ccall_GC_", ITccall (False, False, True)),
290 ("_casm_", ITccall (False, True, False)),
291 ("_casm_GC_", ITccall (False, True, True)),
293 -- interface keywords
294 ("__interface", ITinterface),
295 ("__export", IT__export),
296 ("__depends", ITdepends),
297 ("__forall", IT__forall),
298 ("__letrec", ITletrec),
299 ("__coerce", ITcoerce),
300 ("__inline_me", ITinlineMe),
301 ("__inline_call", ITinlineCall),
302 ("__depends", ITdepends),
303 ("__DEFAULT", ITdefaultbranch),
305 ("__integer", ITinteger_lit),
306 ("__float", ITfloat_lit),
307 ("__int64", ITint64_lit),
308 ("__word", ITword_lit),
309 ("__word64", ITword64_lit),
310 ("__rational", ITrational_lit),
311 ("__addr", ITaddr_lit),
312 ("__litlit", ITlit_lit),
313 ("__string", ITstring_lit),
316 ("__fuall", ITfuall),
318 ("__P", ITspecialise),
321 ("__D", ITdeprecated),
322 ("__U", ITunfold NoInlinePragInfo),
324 ("__ccall", ITccall (False, False, False)),
325 ("__ccall_GC", ITccall (False, False, True)),
326 ("__dyn_ccall", ITccall (True, False, False)),
327 ("__dyn_ccall_GC", ITccall (True, False, True)),
328 ("__casm", ITccall (False, True, False)),
329 ("__dyn_casm", ITccall (True, True, False)),
330 ("__casm_GC", ITccall (False, True, True)),
331 ("__dyn_casm_GC", ITccall (True, True, True)),
337 haskellKeySymsFM = listToUFM $
338 map (\ (x,y) -> (_PK_ x,y))
351 ,(".", ITdot) -- sadly, for 'forall a . t'
355 -----------------------------------------------------------------------------
360 - (glaexts) lexing an interface file or -fglasgow-exts
361 - (bol) pointer to beginning of line (for column calculations)
362 - (buf) pointer to beginning of token
363 - (buf) pointer to current char
364 - (atbol) flag indicating whether we're at the beginning of a line
367 lexer :: (Token -> P a) -> P a
368 lexer cont buf s@(PState{
370 glasgow_exts = glaexts,
376 -- first, start a new lexeme and lose all the whitespace
378 tab line bol atbol (stepOverLexeme buf)
380 line = srcLocLine loc
382 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
383 case currentChar# buf of
386 if bufferExhausted (stepOn buf)
387 then cont ITeof buf s'
388 else trace "lexer: misplaced NUL?" $
389 tab y bol atbol (stepOn buf)
391 '\n'# -> let buf' = stepOn buf
392 in tab (y +# 1#) (currentIndex# buf') 1# buf'
394 -- find comments. This got harder in Haskell 98.
395 '-'# -> let trundle n =
396 let next = lookAhead# buf n in
397 if next `eqChar#` '-'# then trundle (n +# 1#)
398 else if is_symbol next || n <# 2#
401 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
404 -- comments and pragmas. We deal with LINE pragmas here,
405 -- and throw out any unrecognised pragmas as comments. Any
406 -- pragmas we know about are dealt with later (after any layout
407 -- processing if necessary).
409 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
410 if lookAhead# buf 2# `eqChar#` '#'# then
411 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
412 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
413 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
414 let lexeme = mkFastString -- ToDo: too slow
415 (map toUpper (lexemeToString buf2)) in
416 case lookupUFM pragmaKeywordsFM lexeme of
418 line_prag skip_to_end buf2 s'
419 Just other -> is_a_token
420 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
423 else skip_to_end (stepOnBy# buf 2#) s'
425 skip_to_end = nested_comment (lexer cont)
427 -- special GHC extension: we grok cpp-style #line pragmas
428 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
429 line_prag next_line (stepOn buf) s'
431 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
433 -- tabs have been expanded beforehand
434 c | is_space c -> tab y bol atbol (stepOn buf)
435 | otherwise -> is_a_token
437 where s' = s{loc = replaceSrcLine loc y,
441 is_a_token | atbol /=# 0# = lexBOL cont buf s'
442 | otherwise = lexToken cont glaexts buf s'
444 -- {-# LINE .. #-} pragmas. yeuch.
445 line_prag cont buf s@PState{loc=loc} =
446 case expandWhile# is_space buf of { buf1 ->
447 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
448 -- subtract one: the line number refers to the *following* line.
449 let real_line = line - 1 in
450 case fromInteger real_line of { i@(I# l) ->
451 -- ToDo, if no filename then we skip the newline.... d'oh
452 case expandWhile# is_space buf2 of { buf3 ->
453 case currentChar# buf3 of
455 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
457 file = lexemeToFastString buf4
458 new_buf = stepOn (stepOverLexeme buf4)
460 if nullFastString file
461 then cont new_buf s{loc = replaceSrcLine loc l}
462 else cont new_buf s{loc = mkSrcLoc file i}
464 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
467 nested_comment :: P a -> P a
468 nested_comment cont buf = loop buf
471 case currentChar# buf of
472 '\NUL'# | bufferExhausted (stepOn buf) ->
473 lexError "unterminated `{-'" buf
475 '-'# | lookAhead# buf 1# `eqChar#` '}'# ->
476 cont (stepOnBy# buf 2#)
478 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
479 nested_comment (nested_comment cont) (stepOnBy# buf 2#)
481 '\n'# -> \ s@PState{loc=loc} ->
482 let buf' = stepOn buf in
483 nested_comment cont buf'
484 s{loc = incSrcLine loc, bol = currentIndex# buf',
487 _ -> nested_comment cont (stepOn buf)
489 -- When we are lexing the first token of a line, check whether we need to
490 -- insert virtual semicolons or close braces due to layout.
492 lexBOL :: (Token -> P a) -> P a
493 lexBOL cont buf s@(PState{
495 glasgow_exts = glaexts,
500 if need_close_curly then
501 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
502 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
503 else if need_semi_colon then
504 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
505 cont ITsemi buf s{atbol = 0#}
507 lexToken cont glaexts buf s{atbol = 0#}
509 col = currentIndex# buf -# bol
522 Layout n -> col ==# n
525 lexToken :: (Token -> P a) -> Int# -> P a
526 lexToken cont glaexts buf =
528 case currentChar# buf of
530 -- special symbols ----------------------------------------------------
531 '('# | flag glaexts && lookAhead# buf 1# `eqChar#` '#'#
532 -> cont IToubxparen (setCurrentPos# buf 2#)
534 -> cont IToparen (incLexeme buf)
536 ')'# -> cont ITcparen (incLexeme buf)
537 '['# -> cont ITobrack (incLexeme buf)
538 ']'# -> cont ITcbrack (incLexeme buf)
539 ','# -> cont ITcomma (incLexeme buf)
540 ';'# -> cont ITsemi (incLexeme buf)
542 '}'# -> \ s@PState{context = ctx} ->
544 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
545 _ -> lexError "too many '}'s" buf s
547 '#'# -> case lookAhead# buf 1# of
548 ')'# | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
549 '-'# -> case lookAhead# buf 2# of
550 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
551 _ -> lex_sym cont (incLexeme buf)
552 _ -> lex_sym cont (incLexeme buf)
554 '`'# | flag glaexts && lookAhead# buf 1# `eqChar#` '`'#
555 -> lex_cstring cont (setCurrentPos# buf 2#)
557 -> cont ITbackquote (incLexeme buf)
559 '{'# -> -- look for "{-##" special iface pragma
560 case lookAhead# buf 1# of
561 '-'# -> case lookAhead# buf 2# of
562 '#'# -> case lookAhead# buf 3# of
565 = doDiscard False (stepOnBy# (stepOverLexeme buf) 4#) in
566 cont (ITpragma lexeme) buf'
567 _ -> lex_prag cont (setCurrentPos# buf 3#)
568 _ -> cont ITocurly (incLexeme buf)
569 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
571 -- strings/characters -------------------------------------------------
572 '\"'#{-"-} -> lex_string cont glaexts "" (incLexeme buf)
573 '\''# -> lex_char (char_end cont) glaexts (incLexeme buf)
575 -- strictness and cpr pragmas and __scc treated specially.
576 '_'# | flag glaexts ->
577 case lookAhead# buf 1# of
578 '_'# -> case lookAhead# buf 2# of
580 lex_demand cont (stepOnUntil (not . isSpace)
581 (stepOnBy# buf 3#)) -- past __S
583 cont ITcprinfo (stepOnBy# buf 3#) -- past __M
586 case prefixMatch (stepOnBy# buf 3#) "cc" of
587 Just buf' -> lex_scc cont (stepOverLexeme buf')
588 Nothing -> lex_id cont glaexts buf
589 _ -> lex_id cont glaexts buf
590 _ -> lex_id cont glaexts buf
592 -- Hexadecimal and octal constants
593 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
594 -> readNum (after_lexnum cont glaexts) buf' is_hexdigit 16 hex
595 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
596 -> readNum (after_lexnum cont glaexts) buf' is_octdigit 8 oct_or_dec
597 where ch = lookAhead# buf 1#
598 ch2 = lookAhead# buf 2#
599 buf' = setCurrentPos# buf 2#
602 if bufferExhausted (stepOn buf) then
605 trace "lexIface: misplaced NUL?" $
606 cont (ITunknown "\NUL") (stepOn buf)
608 '?'# | flag glaexts && is_lower (lookAhead# buf 1#) ->
609 lex_ip cont (incLexeme buf)
610 c | is_digit c -> lex_num cont glaexts 0 buf
611 | is_symbol c -> lex_sym cont buf
612 | is_upper c -> lex_con cont glaexts buf
613 | is_ident c -> lex_id cont glaexts buf
614 | otherwise -> lexError "illegal character" buf
616 -- Int# is unlifted, and therefore faster than Bool for flags.
622 -------------------------------------------------------------------------------
626 = case expandWhile# is_space buf of { buf1 ->
627 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
628 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
629 case lookupUFM pragmaKeywordsFM lexeme of
630 Just kw -> cont kw (mergeLexemes buf buf2)
631 Nothing -> panic "lex_prag"
634 -------------------------------------------------------------------------------
637 lex_string cont glaexts s buf
638 = case currentChar# buf of
640 let buf' = incLexeme buf; s' = mkFastString (reverse s) in
641 case currentChar# buf' of
642 '#'# | flag glaexts -> cont (ITprimstring s') (incLexeme buf')
643 _ -> cont (ITstring s') buf'
645 -- ignore \& in a string, deal with string gaps
646 '\\'# | next_ch `eqChar#` '&'#
647 -> lex_string cont glaexts s buf'
649 -> lex_stringgap cont glaexts s (incLexeme buf)
651 where next_ch = lookAhead# buf 1#
652 buf' = setCurrentPos# buf 2#
654 _ -> lex_char (lex_next_string cont s) glaexts buf
656 lex_stringgap cont glaexts s buf
657 = let buf' = incLexeme buf in
658 case currentChar# buf of
659 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont glaexts s buf'
660 st{loc = incSrcLine loc}
661 '\\'# -> lex_string cont glaexts s buf'
662 c | is_space c -> lex_stringgap cont glaexts s buf'
663 other -> charError buf'
665 lex_next_string cont s glaexts c buf = lex_string cont glaexts (c:s) buf
667 lex_char :: (Int# -> Char -> P a) -> Int# -> P a
668 lex_char cont glaexts buf
669 = case currentChar# buf of
670 '\\'# -> lex_escape (cont glaexts) (incLexeme buf)
671 c | is_any c -> cont glaexts (C# c) (incLexeme buf)
672 other -> charError buf
674 char_end cont glaexts c buf
675 = case currentChar# buf of
676 '\''# -> let buf' = incLexeme buf in
677 case currentChar# buf' of
679 -> cont (ITprimchar c) (incLexeme buf')
680 _ -> cont (ITchar c) buf'
684 = let buf' = incLexeme buf in
685 case currentChar# buf of
686 'a'# -> cont '\a' buf'
687 'b'# -> cont '\b' buf'
688 'f'# -> cont '\f' buf'
689 'n'# -> cont '\n' buf'
690 'r'# -> cont '\r' buf'
691 't'# -> cont '\t' buf'
692 'v'# -> cont '\v' buf'
693 '\\'# -> cont '\\' buf'
694 '"'# -> cont '\"' buf'
695 '\''# -> cont '\'' buf'
696 '^'# -> let c = currentChar# buf' in
697 if c `geChar#` '@'# && c `leChar#` '_'#
698 then cont (C# (chr# (ord# c -# ord# '@'#))) (incLexeme buf')
701 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
702 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
704 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
706 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
707 Just buf2 <- [prefixMatch buf p] ] of
708 (c,buf2):_ -> cont c buf2
711 after_charnum cont i buf
712 = let int = fromInteger i in
713 if i >= 0 && i <= 255
714 then cont (chr int) buf
717 readNum cont buf is_digit base conv = read buf 0
719 = case currentChar# buf of { c ->
721 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
727 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
728 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
730 hex c | is_digit c = ord# c -# ord# '0'#
731 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
732 oct_or_dec c = ord# c -# ord# '0'#
734 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
737 | c `geChar#` 'A'# && c `leChar#` 'Z'#
738 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
741 charError buf = lexError "error in character literal" buf
743 silly_escape_chars = [
780 -------------------------------------------------------------------------------
782 lex_demand cont buf =
783 case read_em [] buf of { (ls,buf') ->
784 case currentChar# buf' of
785 'B'# -> cont (ITstrict (ls, True )) (incLexeme buf')
786 _ -> cont (ITstrict (ls, False)) buf'
789 -- code snatched from Demand.lhs
791 case currentChar# buf of
792 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
793 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
794 'S'# -> read_em (WwStrict : acc) (stepOn buf)
795 'P'# -> read_em (WwPrim : acc) (stepOn buf)
796 'E'# -> read_em (WwEnum : acc) (stepOn buf)
797 ')'# -> (reverse acc, stepOn buf)
798 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
799 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
800 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
801 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
802 _ -> (reverse acc, buf)
804 do_unpack new_or_data wrapper_unpacks acc buf
805 = case read_em [] buf of
806 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
811 case currentChar# buf of
812 'C'# -> cont ITsccAllCafs (incLexeme buf)
813 other -> cont ITscc buf
815 -----------------------------------------------------------------------------
818 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
819 lex_num cont glaexts acc buf =
820 case scanNumLit acc buf of
822 case currentChar# buf' of
823 '.'# | is_digit (lookAhead# buf' 1#) ->
824 -- this case is not optimised at all, as the
825 -- presence of floating point numbers in interface
826 -- files is not that common. (ToDo)
827 case expandWhile# is_digit (incLexeme buf') of
828 buf2 -> -- points to first non digit char
830 let l = case currentChar# buf2 of
836 = let buf3 = incLexeme buf2 in
837 case currentChar# buf3 of
838 '-'# -> expandWhile# is_digit (incLexeme buf3)
839 '+'# -> expandWhile# is_digit (incLexeme buf3)
840 x | is_digit x -> expandWhile# is_digit buf3
843 v = readRational__ (lexemeToString l)
845 in case currentChar# l of -- glasgow exts only
846 '#'# | flag glaexts -> let l' = incLexeme l in
847 case currentChar# l' of
848 '#'# -> cont (ITprimdouble v) (incLexeme l')
849 _ -> cont (ITprimfloat v) l'
850 _ -> cont (ITrational v) l
852 _ -> after_lexnum cont glaexts acc' buf'
854 after_lexnum cont glaexts i buf
855 = case currentChar# buf of
856 '#'# | flag glaexts -> cont (ITprimint i) (incLexeme buf)
857 _ -> cont (ITinteger i) buf
859 -----------------------------------------------------------------------------
860 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
862 -- we lexemeToFastString on the bit between the ``''s, but include the
863 -- quotes in the full lexeme.
865 lex_cstring cont buf =
866 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
867 Just buf' -> cont (ITlitlit (lexemeToFastString
868 (setCurrentPos# buf' (negateInt# 2#))))
869 (mergeLexemes buf buf')
870 Nothing -> lexError "unterminated ``" buf
872 -----------------------------------------------------------------------------
873 -- identifiers, symbols etc.
876 case expandWhile# is_ident buf of
877 buf' -> cont (ITipvarid lexeme) buf'
878 where lexeme = lexemeToFastString buf'
880 lex_id cont glaexts buf =
881 let buf1 = expandWhile# is_ident buf in
884 case (if flag glaexts
885 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
886 else buf1) of { buf' ->
888 let lexeme = lexemeToFastString buf' in
890 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
891 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
895 let var_token = cont (ITvarid lexeme) buf' in
897 if not (flag glaexts)
901 case lookupUFM ghcExtensionKeywordsFM lexeme of {
902 Just kwd_token -> cont kwd_token buf';
908 case expandWhile# is_symbol buf of
909 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
910 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
911 cont kwd_token buf' ;
912 Nothing -> --trace ("sym: "++unpackFS lexeme) $
913 cont (mk_var_token lexeme) buf'
915 where lexeme = lexemeToFastString buf'
918 lex_con cont glaexts buf =
919 case expandWhile# is_ident buf of { buf1 ->
920 case slurp_trailing_hashes buf1 glaexts of { buf' ->
922 case currentChar# buf' of
927 just_a_conid = --trace ("con: "++unpackFS lexeme) $
928 cont (ITconid lexeme) buf'
929 lexeme = lexemeToFastString buf'
930 munch = lex_qid cont glaexts lexeme (incLexeme buf') just_a_conid
933 lex_qid cont glaexts mod buf just_a_conid =
934 case currentChar# buf of
935 '['# -> -- Special case for []
936 case lookAhead# buf 1# of
937 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
940 '('# -> -- Special case for (,,,)
941 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
942 case lookAhead# buf 1# of
943 '#'# | flag glaexts -> case lookAhead# buf 2# of
944 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
947 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
948 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
951 '-'# -> case lookAhead# buf 1# of
952 '>'# -> cont (ITqconid (mod,SLIT("->"))) (setCurrentPos# buf 2#)
953 _ -> lex_id3 cont glaexts mod buf just_a_conid
954 _ -> lex_id3 cont glaexts mod buf just_a_conid
956 lex_id3 cont glaexts mod buf just_a_conid
957 | is_symbol (currentChar# buf) =
959 start_new_lexeme = stepOverLexeme buf
961 case expandWhile# is_symbol start_new_lexeme of { buf' ->
963 lexeme = lexemeToFastString buf'
964 -- real lexeme is M.<sym>
965 new_buf = mergeLexemes buf buf'
967 cont (mk_qvar_token mod lexeme) new_buf
968 -- wrong, but arguably morally right: M... is now a qvarsym
973 start_new_lexeme = stepOverLexeme buf
975 case expandWhile# is_ident start_new_lexeme of { buf1 ->
980 case slurp_trailing_hashes buf1 glaexts of { buf' ->
983 lexeme = lexemeToFastString buf'
984 new_buf = mergeLexemes buf buf'
985 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
987 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
988 Just kwd_token -> just_a_conid; -- avoid M.where etc.
989 Nothing -> is_a_qvarid
990 -- TODO: special ids (as, qualified, hiding) shouldn't be
991 -- recognised as keywords here. ie. M.as is a qualified varid.
995 slurp_trailing_hashes buf glaexts
996 | flag glaexts = expandWhile# (`eqChar#` '#'#) buf
1001 | is_upper f = ITconid pk_str
1002 | is_ident f = ITvarid pk_str
1003 | f `eqChar#` ':'# = ITconsym pk_str
1004 | otherwise = ITvarsym pk_str
1006 (C# f) = _HEAD_ pk_str
1009 mk_qvar_token m token =
1010 case mk_var_token token of
1011 ITconid n -> ITqconid (m,n)
1012 ITvarid n -> ITqvarid (m,n)
1013 ITconsym n -> ITqconsym (m,n)
1014 ITvarsym n -> ITqvarsym (m,n)
1015 _ -> ITunknown (show token)
1018 ----------------------------------------------------------------------------
1019 Horrible stuff for dealing with M.(,,,)
1022 lex_tuple cont mod buf back_off =
1026 case currentChar# buf of
1027 ','# -> go (n+1) (stepOn buf)
1028 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1031 lex_ubx_tuple cont mod buf back_off =
1035 case currentChar# buf of
1036 ','# -> go (n+1) (stepOn buf)
1037 '#'# -> case lookAhead# buf 1# of
1038 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1044 -----------------------------------------------------------------------------
1045 doDiscard rips along really fast, looking for a '#-}',
1046 indicating the end of the pragma we're skipping
1049 doDiscard inStr buf =
1050 case currentChar# buf of
1052 case lookAhead# buf 1# of { '#'# ->
1053 case lookAhead# buf 2# of { '-'# ->
1054 case lookAhead# buf 3# of { '}'# ->
1055 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
1056 _ -> doDiscard inStr (incLexeme buf) };
1057 _ -> doDiscard inStr (incLexeme buf) };
1058 _ -> doDiscard inStr (incLexeme buf) }
1061 odd_slashes buf flg i# =
1062 case lookAhead# buf i# of
1063 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1066 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1067 '\\'# -> -- escaping something..
1068 if odd_slashes buf True (negateInt# 2#) then
1069 -- odd number of slashes, " is escaped.
1070 doDiscard inStr (incLexeme buf)
1072 -- even number of slashes, \ is escaped.
1073 doDiscard (not inStr) (incLexeme buf)
1074 _ -> case inStr of -- forced to avoid build-up
1075 True -> doDiscard False (incLexeme buf)
1076 False -> doDiscard True (incLexeme buf)
1077 _ -> doDiscard inStr (incLexeme buf)
1081 -----------------------------------------------------------------------------
1092 data PState = PState {
1094 glasgow_exts :: Int#,
1097 context :: [LayoutContext]
1100 type P a = StringBuffer -- Input string
1105 returnP a buf s = POk s a
1107 thenP :: P a -> (a -> P b) -> P b
1108 m `thenP` k = \ buf s ->
1110 POk s1 a -> k a buf s1
1111 PFailed err -> PFailed err
1113 thenP_ :: P a -> P b -> P b
1114 m `thenP_` k = m `thenP` \_ -> k
1116 mapP :: (a -> P b) -> [a] -> P [b]
1117 mapP f [] = returnP []
1120 mapP f as `thenP` \bs ->
1123 failP :: String -> P a
1124 failP msg buf s = PFailed (text msg)
1126 failMsgP :: Message -> P a
1127 failMsgP msg buf s = PFailed msg
1129 lexError :: String -> P a
1130 lexError str buf s@PState{ loc = loc }
1131 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1133 getSrcLocP :: P SrcLoc
1134 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1136 getSrcFile :: P FAST_STRING
1137 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1139 getContext :: P [LayoutContext]
1140 getContext buf s@(PState{ context = ctx }) = POk s ctx
1142 pushContext :: LayoutContext -> P ()
1143 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1147 This special case in layoutOn is to handle layout contexts with are
1148 indented the same or less than the current context. This is illegal
1149 according to the Haskell spec, so we have to arrange to close the
1150 current context. eg.
1155 after the first 'where', the sequence of events is:
1157 - layout system inserts a ';' (column 0)
1158 - parser begins a new context at column 0
1159 - parser shifts ';' (legal empty declaration)
1160 - parser sees 'class': parse error (we're still in the inner context)
1162 trouble is, by the time we know we need a new context, the lexer has
1163 already generated the ';'. Hacky solution is as follows: since we
1164 know the column of the next token (it's the column number of the new
1165 context), we set the ACTUAL column number of the new context to this
1166 numer plus one. Hence the next time the lexer is called, a '}' will
1167 be generated to close the new context straight away. Furthermore, we
1168 have to set the atbol flag so that the ';' that the parser shifted as
1169 part of the new context is re-generated.
1171 when the new context is *less* indented than the current one:
1173 f = f where g = g where
1176 - current context: column 12.
1177 - on seeing 'h' (column 0), the layout system inserts '}'
1178 - parser starts a new context, column 0
1179 - parser sees '}', uses it to close new context
1180 - we still need to insert another '}' followed by a ';',
1181 hence the atbol trick.
1183 There's also a special hack in here to deal with
1190 i.e. the inner context is at the same indentation level as the outer
1191 context. This is strictly illegal according to Haskell 98, but
1192 there's a lot of existing code using this style and it doesn't make
1193 any sense to disallow it, since empty 'do' lists don't make sense.
1196 layoutOn :: Bool -> P ()
1197 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1198 let offset = lexemeIndex buf -# bol in
1201 | if strict then prev_off >=# offset else prev_off ># offset ->
1202 --trace ("layout on, column: " ++ show (I# offset)) $
1203 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1205 --trace ("layout on, column: " ++ show (I# offset)) $
1206 POk s{ context = Layout offset : ctx } ()
1209 layoutOff buf s@(PState{ context = ctx }) =
1210 POk s{ context = NoLayout:ctx } ()
1213 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1215 (_:tl) -> POk s{ context = tl } ()
1216 [] -> PFailed (srcParseErr buf loc)
1219 Note that if the name of the file we're processing ends
1220 with `hi-boot', we accept it on faith as having the right
1221 version. This is done so that .hi-boot files that comes
1222 with hsc don't have to be updated before every release,
1223 *and* it allows us to share .hi-boot files with versions
1224 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1226 If the version number is 0, the checking is also turned off.
1227 (needed to deal with GHC.hi only!)
1229 Once we can assume we're compiling with a version of ghc that
1230 supports interface file checking, we can drop the special
1233 checkVersion :: Maybe Integer -> P ()
1234 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1235 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1236 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1237 checkVersion mb@Nothing buf s@(PState{loc = loc})
1238 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1239 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1241 -----------------------------------------------------------------
1243 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1245 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1246 text (lexemeToString s), char '\'']
1248 ifaceVersionErr hi_vers l toks
1249 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1250 ptext SLIT("Expected"), int opt_HiVersion,
1251 ptext SLIT("found "), pp_version]
1255 Nothing -> ptext SLIT("pre ghc-3.02 version")
1256 Just v -> ptext SLIT("version") <+> integer v
1258 -----------------------------------------------------------------------------
1260 srcParseErr :: StringBuffer -> SrcLoc -> Message
1264 then ptext SLIT(": parse error (possibly incorrect indentation)")
1265 else hcat [ptext SLIT(": parse error on input "),
1266 char '`', text token, char '\'']
1269 token = lexemeToString s