2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "hs_ctype.h" #-}
22 ifaceParseErr, srcParseErr,
25 Token(..), lexer, ParseResult(..), PState(..),
29 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
30 getSrcLocP, setSrcLocP, getSrcFile,
31 layoutOn, layoutOff, pushContext, popContext
34 #include "HsVersions.h"
36 import Char ( isSpace, toUpper )
37 import List ( isSuffixOf )
39 import IdInfo ( InlinePragInfo(..) )
40 import PrelNames ( mkTupNameStr )
41 import CmdLineOpts ( opt_HiVersion, opt_NoHiCheck )
42 import Demand ( Demand(..) {- instance Read -} )
43 import UniqFM ( listToUFM, lookupUFM )
44 import BasicTypes ( NewOrData(..), Boxity(..) )
45 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
46 replaceSrcLine, mkSrcLoc )
48 import ErrUtils ( Message )
55 import Char ( chr, ord )
56 import PrelRead ( readRational__ ) -- Glasgow non-std
59 %************************************************************************
61 \subsection{Data types}
63 %************************************************************************
65 The token data type, fairly un-interesting except from one
66 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
67 strictness, unfolding etc).
69 The Idea/Observation here is that the renamer needs to scan through
70 all of an interface file before it can continue. But only a fraction
71 of the information contained in the file turns out to be useful, so
72 delaying as much as possible of the scanning and parsing of an
73 interface file Makes Sense (Heap profiles of the compiler
74 show a reduction in heap usage by at least a factor of two,
77 Hence, the interface file lexer spots when value declarations are
78 being scanned and return the @ITidinfo@ and @ITtype@ constructors
79 for the type and any other id info for that binding (unfolding, strictness
80 etc). These constructors are applied to the result of lexing these sub-chunks.
82 The lexing of the type and id info is all done lazily, of course, so
83 the scanning (and subsequent parsing) will be done *only* on the ids the
84 renamer finds out that it is interested in. The rest will just be junked.
85 Laziness, you know it makes sense :-)
89 = ITas -- Haskell keywords
113 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
115 | ITforall -- GHC extension keywords
125 | ITinterface -- interface keywords
133 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
152 | ITunfold InlinePragInfo
153 | ITstrict ([Demand], Bool)
160 | ITspecialise_prag -- Pragmas
170 | ITdotdot -- reserved symbols
184 | ITbiglam -- GHC-extension symbols
186 | ITocurly -- special symbols
188 | ITocurlybar -- {|, for type applications
189 | ITccurlybar -- |}, for type applications
202 | ITvarid FAST_STRING -- identifiers
203 | ITconid FAST_STRING
204 | ITvarsym FAST_STRING
205 | ITconsym FAST_STRING
206 | ITqvarid (FAST_STRING,FAST_STRING)
207 | ITqconid (FAST_STRING,FAST_STRING)
208 | ITqvarsym (FAST_STRING,FAST_STRING)
209 | ITqconsym (FAST_STRING,FAST_STRING)
211 | ITipvarid FAST_STRING -- GHC extension: implicit param: ?x
213 | ITpragma StringBuffer
216 | ITstring FAST_STRING
218 | ITrational Rational
221 | ITprimstring FAST_STRING
223 | ITprimfloat Rational
224 | ITprimdouble Rational
225 | ITlitlit FAST_STRING
227 | ITunknown String -- Used when the lexer can't make sense of it
228 | ITeof -- end of file token
229 deriving Show -- debugging
232 -----------------------------------------------------------------------------
236 pragmaKeywordsFM = listToUFM $
237 map (\ (x,y) -> (_PK_ x,y))
238 [( "SPECIALISE", ITspecialise_prag ),
239 ( "SPECIALIZE", ITspecialise_prag ),
240 ( "SOURCE", ITsource_prag ),
241 ( "INLINE", ITinline_prag ),
242 ( "NOINLINE", ITnoinline_prag ),
243 ( "NOTINLINE", ITnoinline_prag ),
244 ( "LINE", ITline_prag ),
245 ( "RULES", ITrules_prag ),
246 ( "RULEZ", ITrules_prag ), -- american spelling :-)
247 ( "SCC", ITscc_prag ),
248 ( "DEPRECATED", ITdeprecated_prag )
251 haskellKeywordsFM = listToUFM $
252 map (\ (x,y) -> (_PK_ x,y))
253 [( "_", ITunderscore ),
256 ( "class", ITclass ),
258 ( "default", ITdefault ),
259 ( "deriving", ITderiving ),
262 ( "hiding", IThiding ),
264 ( "import", ITimport ),
266 ( "infix", ITinfix ),
267 ( "infixl", ITinfixl ),
268 ( "infixr", ITinfixr ),
269 ( "instance", ITinstance ),
271 ( "module", ITmodule ),
272 ( "newtype", ITnewtype ),
274 ( "qualified", ITqualified ),
277 ( "where", ITwhere ),
278 ( "_scc_", ITscc ) -- ToDo: remove
281 isSpecial :: Token -> Bool
282 -- If we see M.x, where x is a keyword, but
283 -- is special, we treat is as just plain M.x,
285 isSpecial ITas = True
286 isSpecial IThiding = True
287 isSpecial ITqualified = True
288 isSpecial ITforall = True
289 isSpecial ITexport = True
290 isSpecial ITlabel = True
291 isSpecial ITdynamic = True
292 isSpecial ITunsafe = True
293 isSpecial ITwith = True
294 isSpecial ITccallconv = True
295 isSpecial ITstdcallconv = True
298 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
299 ghcExtensionKeywordsFM = listToUFM $
300 map (\ (x,y) -> (_PK_ x,y))
301 [ ( "forall", ITforall ),
302 ( "foreign", ITforeign ),
303 ( "export", ITexport ),
304 ( "label", ITlabel ),
305 ( "dynamic", ITdynamic ),
306 ( "unsafe", ITunsafe ),
308 ( "stdcall", ITstdcallconv),
309 ( "ccall", ITccallconv),
310 ("_ccall_", ITccall (False, False, False)),
311 ("_ccall_GC_", ITccall (False, False, True)),
312 ("_casm_", ITccall (False, True, False)),
313 ("_casm_GC_", ITccall (False, True, True)),
315 -- interface keywords
316 ("__interface", ITinterface),
317 ("__export", IT__export),
318 ("__depends", ITdepends),
319 ("__forall", IT__forall),
320 ("__letrec", ITletrec),
321 ("__coerce", ITcoerce),
322 ("__inline_me", ITinlineMe),
323 ("__inline_call", ITinlineCall),
324 ("__depends", ITdepends),
325 ("__DEFAULT", ITdefaultbranch),
327 ("__integer", ITinteger_lit),
328 ("__float", ITfloat_lit),
329 ("__int64", ITint64_lit),
330 ("__word", ITword_lit),
331 ("__word64", ITword64_lit),
332 ("__rational", ITrational_lit),
333 ("__addr", ITaddr_lit),
334 ("__label", ITlabel_lit),
335 ("__litlit", ITlit_lit),
336 ("__string", ITstring_lit),
339 ("__fuall", ITfuall),
341 ("__P", ITspecialise),
344 ("__D", ITdeprecated),
345 ("__U", ITunfold NoInlinePragInfo),
347 ("__ccall", ITccall (False, False, False)),
348 ("__ccall_GC", ITccall (False, False, True)),
349 ("__dyn_ccall", ITccall (True, False, False)),
350 ("__dyn_ccall_GC", ITccall (True, False, True)),
351 ("__casm", ITccall (False, True, False)),
352 ("__dyn_casm", ITccall (True, True, False)),
353 ("__casm_GC", ITccall (False, True, True)),
354 ("__dyn_casm_GC", ITccall (True, True, True)),
360 haskellKeySymsFM = listToUFM $
361 map (\ (x,y) -> (_PK_ x,y))
374 ,(".", ITdot) -- sadly, for 'forall a . t'
378 -----------------------------------------------------------------------------
383 - (glaexts) lexing an interface file or -fglasgow-exts
384 - (bol) pointer to beginning of line (for column calculations)
385 - (buf) pointer to beginning of token
386 - (buf) pointer to current char
387 - (atbol) flag indicating whether we're at the beginning of a line
390 lexer :: (Token -> P a) -> P a
391 lexer cont buf s@(PState{
393 glasgow_exts = glaexts,
399 -- first, start a new lexeme and lose all the whitespace
400 = tab line bol atbol (stepOverLexeme buf)
402 line = srcLocLine loc
404 tab y bol atbol buf = -- trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
405 case currentChar# buf of
408 if bufferExhausted (stepOn buf)
409 then cont ITeof buf s'
410 else trace "lexer: misplaced NUL?" $
411 tab y bol atbol (stepOn buf)
413 '\n'# -> let buf' = stepOn buf
414 in tab (y +# 1#) (currentIndex# buf') 1# buf'
416 -- find comments. This got harder in Haskell 98.
417 '-'# -> let trundle n =
418 let next = lookAhead# buf n in
419 if next `eqChar#` '-'# then trundle (n +# 1#)
420 else if is_symbol next || n <# 2#
423 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
426 -- comments and pragmas. We deal with LINE pragmas here,
427 -- and throw out any unrecognised pragmas as comments. Any
428 -- pragmas we know about are dealt with later (after any layout
429 -- processing if necessary).
430 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
431 if lookAhead# buf 2# `eqChar#` '#'# then
432 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
433 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
434 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
435 let lexeme = mkFastString -- ToDo: too slow
436 (map toUpper (lexemeToString buf2)) in
437 case lookupUFM pragmaKeywordsFM lexeme of
439 line_prag skip_to_end buf2 s'
440 Just other -> is_a_token
441 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
444 else skip_to_end (stepOnBy# buf 2#) s'
446 skip_to_end = nested_comment (lexer cont)
448 -- special GHC extension: we grok cpp-style #line pragmas
449 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
450 case expandWhile# is_space (stepOn buf) of { buf1 ->
451 if is_digit (currentChar# buf1)
452 then line_prag next_line buf1 s'
456 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
458 -- tabs have been expanded beforehand
459 c | is_space c -> tab y bol atbol (stepOn buf)
460 | otherwise -> is_a_token
462 where s' = s{loc = replaceSrcLine loc y,
466 is_a_token | atbol /=# 0# = lexBOL cont buf s'
467 | otherwise = lexToken cont glaexts buf s'
469 -- {-# LINE .. #-} pragmas. yeuch.
470 line_prag cont buf s@PState{loc=loc} =
471 case expandWhile# is_space buf of { buf1 ->
472 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
473 -- subtract one: the line number refers to the *following* line.
474 let real_line = line - 1 in
475 case fromInteger real_line of { i@(I# l) ->
476 -- ToDo, if no filename then we skip the newline.... d'oh
477 case expandWhile# is_space buf2 of { buf3 ->
478 case currentChar# buf3 of
480 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
482 file = lexemeToFastString buf4
483 new_buf = stepOn (stepOverLexeme buf4)
485 if nullFastString file
486 then cont new_buf s{loc = replaceSrcLine loc l}
487 else cont new_buf s{loc = mkSrcLoc file i}
489 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
492 nested_comment :: P a -> P a
493 nested_comment cont buf = loop buf
496 case currentChar# buf of
497 '\NUL'# | bufferExhausted (stepOn buf) ->
498 lexError "unterminated `{-'" buf -- -}
499 '-'# | lookAhead# buf 1# `eqChar#` '}'# ->
500 cont (stepOnBy# buf 2#)
502 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
503 nested_comment (nested_comment cont) (stepOnBy# buf 2#)
505 '\n'# -> \ s@PState{loc=loc} ->
506 let buf' = stepOn buf in
507 nested_comment cont buf'
508 s{loc = incSrcLine loc, bol = currentIndex# buf',
511 _ -> nested_comment cont (stepOn buf)
513 -- When we are lexing the first token of a line, check whether we need to
514 -- insert virtual semicolons or close braces due to layout.
516 lexBOL :: (Token -> P a) -> P a
517 lexBOL cont buf s@(PState{
519 glasgow_exts = glaexts,
524 if need_close_curly then
525 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
526 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
527 else if need_semi_colon then
528 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
529 cont ITsemi buf s{atbol = 0#}
531 lexToken cont glaexts buf s{atbol = 0#}
533 col = currentIndex# buf -# bol
546 Layout n -> col ==# n
549 lexToken :: (Token -> P a) -> Int# -> P a
550 lexToken cont glaexts buf =
551 -- trace "lexToken" $
552 case currentChar# buf of
554 -- special symbols ----------------------------------------------------
555 '('# | flag glaexts && lookAhead# buf 1# `eqChar#` '#'#
556 -> cont IToubxparen (setCurrentPos# buf 2#)
558 -> cont IToparen (incLexeme buf)
560 ')'# -> cont ITcparen (incLexeme buf)
561 '['# -> cont ITobrack (incLexeme buf)
562 ']'# -> cont ITcbrack (incLexeme buf)
563 ','# -> cont ITcomma (incLexeme buf)
564 ';'# -> cont ITsemi (incLexeme buf)
565 '}'# -> \ s@PState{context = ctx} ->
567 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
568 _ -> lexError "too many '}'s" buf s
569 '|'# -> case lookAhead# buf 1# of
570 '}'# | flag glaexts -> cont ITccurlybar
571 (setCurrentPos# buf 2#)
572 _ -> lex_sym cont (incLexeme buf)
575 '#'# -> case lookAhead# buf 1# of
576 ')'# | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
577 '-'# -> case lookAhead# buf 2# of
578 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
579 _ -> lex_sym cont (incLexeme buf)
580 _ -> lex_sym cont (incLexeme buf)
582 '`'# | flag glaexts && lookAhead# buf 1# `eqChar#` '`'#
583 -> lex_cstring cont (setCurrentPos# buf 2#)
585 -> cont ITbackquote (incLexeme buf)
587 '{'# -> -- look for "{-##" special iface pragma
588 case lookAhead# buf 1# of
590 -> cont ITocurlybar (setCurrentPos# buf 2#)
591 '-'# -> case lookAhead# buf 2# of
592 '#'# -> case lookAhead# buf 3# of
595 = doDiscard 0# (stepOnBy# (stepOverLexeme buf) 4#) in
596 cont (ITpragma lexeme) buf'
597 _ -> lex_prag cont (setCurrentPos# buf 3#)
598 _ -> cont ITocurly (incLexeme buf)
599 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
601 -- strings/characters -------------------------------------------------
602 '\"'#{-"-} -> lex_string cont glaexts [] (incLexeme buf)
603 '\''# -> lex_char (char_end cont) glaexts (incLexeme buf)
605 -- strictness and cpr pragmas and __scc treated specially.
606 '_'# | flag glaexts ->
607 case lookAhead# buf 1# of
608 '_'# -> case lookAhead# buf 2# of
610 lex_demand cont (stepOnUntil (not . isSpace)
611 (stepOnBy# buf 3#)) -- past __S
613 cont ITcprinfo (stepOnBy# buf 3#) -- past __M
616 case prefixMatch (stepOnBy# buf 3#) "cc" of
617 Just buf' -> lex_scc cont (stepOverLexeme buf')
618 Nothing -> lex_id cont glaexts buf
619 _ -> lex_id cont glaexts buf
620 _ -> lex_id cont glaexts buf
622 -- Hexadecimal and octal constants
623 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
624 -> readNum (after_lexnum cont glaexts) buf' is_hexdigit 16 hex
625 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
626 -> readNum (after_lexnum cont glaexts) buf' is_octdigit 8 oct_or_dec
627 where ch = lookAhead# buf 1#
628 ch2 = lookAhead# buf 2#
629 buf' = setCurrentPos# buf 2#
632 if bufferExhausted (stepOn buf) then
635 trace "lexIface: misplaced NUL?" $
636 cont (ITunknown "\NUL") (stepOn buf)
638 '?'# | flag glaexts && is_lower (lookAhead# buf 1#) ->
639 lex_ip cont (incLexeme buf)
640 c | is_digit c -> lex_num cont glaexts 0 buf
641 | is_symbol c -> lex_sym cont buf
642 | is_upper c -> lex_con cont glaexts buf
643 | is_ident c -> lex_id cont glaexts buf
644 | otherwise -> lexError "illegal character" buf
646 -- Int# is unlifted, and therefore faster than Bool for flags.
652 -------------------------------------------------------------------------------
656 = case expandWhile# is_space buf of { buf1 ->
657 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
658 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
659 case lookupUFM pragmaKeywordsFM lexeme of
660 Just kw -> cont kw (mergeLexemes buf buf2)
661 Nothing -> panic "lex_prag"
664 -------------------------------------------------------------------------------
667 lex_string cont glaexts s buf
668 = case currentChar# buf of
670 let buf' = incLexeme buf
671 s' = mkFastStringNarrow (map chr (reverse s))
672 in case currentChar# buf' of
673 '#'# | flag glaexts -> if all (<= 0xFF) s
674 then cont (ITprimstring s') (incLexeme buf')
675 else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
676 _ -> cont (ITstring s') buf'
678 -- ignore \& in a string, deal with string gaps
679 '\\'# | next_ch `eqChar#` '&'#
680 -> lex_string cont glaexts s buf'
682 -> lex_stringgap cont glaexts s (incLexeme buf)
684 where next_ch = lookAhead# buf 1#
685 buf' = setCurrentPos# buf 2#
687 _ -> lex_char (lex_next_string cont s) glaexts buf
689 lex_stringgap cont glaexts s buf
690 = let buf' = incLexeme buf in
691 case currentChar# buf of
692 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont glaexts s buf'
693 st{loc = incSrcLine loc}
694 '\\'# -> lex_string cont glaexts s buf'
695 c | is_space c -> lex_stringgap cont glaexts s buf'
696 other -> charError buf'
698 lex_next_string cont s glaexts c buf = lex_string cont glaexts (c:s) buf
700 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
701 lex_char cont glaexts buf
702 = case currentChar# buf of
703 '\\'# -> lex_escape (cont glaexts) (incLexeme buf)
704 c | is_string c -> cont glaexts (I# (ord# c)) (incLexeme buf)
705 other -> charError buf
707 char_end cont glaexts c buf
708 = case currentChar# buf of
709 '\''# -> let buf' = incLexeme buf in
710 case currentChar# buf' of
712 -> cont (ITprimchar c) (incLexeme buf')
713 _ -> cont (ITchar c) buf'
717 = let buf' = incLexeme buf in
718 case currentChar# buf of
719 'a'# -> cont (ord '\a') buf'
720 'b'# -> cont (ord '\b') buf'
721 'f'# -> cont (ord '\f') buf'
722 'n'# -> cont (ord '\n') buf'
723 'r'# -> cont (ord '\r') buf'
724 't'# -> cont (ord '\t') buf'
725 'v'# -> cont (ord '\v') buf'
726 '\\'# -> cont (ord '\\') buf'
727 '"'# -> cont (ord '\"') buf'
728 '\''# -> cont (ord '\'') buf'
729 '^'# -> let c = currentChar# buf' in
730 if c `geChar#` '@'# && c `leChar#` '_'#
731 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
734 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
735 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
737 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
739 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
740 Just buf2 <- [prefixMatch buf p] ] of
741 (c,buf2):_ -> cont (ord c) buf2
744 after_charnum cont i buf = cont (fromInteger i) buf
746 readNum cont buf is_digit base conv = read buf 0
748 = case currentChar# buf of { c ->
750 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
756 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
757 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
759 hex c | is_digit c = ord# c -# ord# '0'#
760 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
761 oct_or_dec c = ord# c -# ord# '0'#
763 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
766 | c `geChar#` 'A'# && c `leChar#` 'Z'#
767 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
770 charError buf = lexError "error in character literal" buf
772 silly_escape_chars = [
809 -------------------------------------------------------------------------------
811 lex_demand cont buf =
812 case read_em [] buf of { (ls,buf') ->
813 case currentChar# buf' of
814 'B'# -> cont (ITstrict (ls, True )) (incLexeme buf')
815 _ -> cont (ITstrict (ls, False)) buf'
818 -- code snatched from Demand.lhs
820 case currentChar# buf of
821 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
822 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
823 'S'# -> read_em (WwStrict : acc) (stepOn buf)
824 'P'# -> read_em (WwPrim : acc) (stepOn buf)
825 'E'# -> read_em (WwEnum : acc) (stepOn buf)
826 ')'# -> (reverse acc, stepOn buf)
827 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
828 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
829 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
830 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
831 _ -> (reverse acc, buf)
833 do_unpack new_or_data wrapper_unpacks acc buf
834 = case read_em [] buf of
835 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
840 case currentChar# buf of
841 'C'# -> cont ITsccAllCafs (incLexeme buf)
842 other -> cont ITscc buf
844 -----------------------------------------------------------------------------
847 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
848 lex_num cont glaexts acc buf =
849 case scanNumLit acc buf of
851 case currentChar# buf' of
852 '.'# | is_digit (lookAhead# buf' 1#) ->
853 -- this case is not optimised at all, as the
854 -- presence of floating point numbers in interface
855 -- files is not that common. (ToDo)
856 case expandWhile# is_digit (incLexeme buf') of
857 buf2 -> -- points to first non digit char
859 let l = case currentChar# buf2 of
865 = let buf3 = incLexeme buf2 in
866 case currentChar# buf3 of
867 '-'# -> expandWhile# is_digit (incLexeme buf3)
868 '+'# -> expandWhile# is_digit (incLexeme buf3)
869 x | is_digit x -> expandWhile# is_digit buf3
872 v = readRational__ (lexemeToString l)
874 in case currentChar# l of -- glasgow exts only
875 '#'# | flag glaexts -> let l' = incLexeme l in
876 case currentChar# l' of
877 '#'# -> cont (ITprimdouble v) (incLexeme l')
878 _ -> cont (ITprimfloat v) l'
879 _ -> cont (ITrational v) l
881 _ -> after_lexnum cont glaexts acc' buf'
883 after_lexnum cont glaexts i buf
884 = case currentChar# buf of
885 '#'# | flag glaexts -> cont (ITprimint i) (incLexeme buf)
886 _ -> cont (ITinteger i) buf
888 -----------------------------------------------------------------------------
889 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
891 -- we lexemeToFastString on the bit between the ``''s, but include the
892 -- quotes in the full lexeme.
894 lex_cstring cont buf =
895 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
896 Just buf' -> cont (ITlitlit (lexemeToFastString
897 (setCurrentPos# buf' (negateInt# 2#))))
898 (mergeLexemes buf buf')
899 Nothing -> lexError "unterminated ``" buf
901 -----------------------------------------------------------------------------
902 -- identifiers, symbols etc.
905 case expandWhile# is_ident buf of
906 buf' -> cont (ITipvarid lexeme) buf'
907 where lexeme = lexemeToFastString buf'
909 lex_id cont glaexts buf =
910 let buf1 = expandWhile# is_ident buf in
913 case (if flag glaexts
914 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
915 else buf1) of { buf' ->
917 let lexeme = lexemeToFastString buf' in
919 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
920 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
924 let var_token = cont (ITvarid lexeme) buf' in
926 if not (flag glaexts)
930 case lookupUFM ghcExtensionKeywordsFM lexeme of {
931 Just kwd_token -> cont kwd_token buf';
938 case expandWhile# is_symbol buf of
939 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
940 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
941 cont kwd_token buf' ;
942 Nothing -> --trace ("sym: "++unpackFS lexeme) $
943 cont (mk_var_token lexeme) buf'
945 where lexeme = lexemeToFastString buf'
948 lex_con cont glaexts buf =
949 -- trace ("con: "{-++unpackFS lexeme-}) $
950 case expandWhile# is_ident buf of { buf1 ->
951 case slurp_trailing_hashes buf1 glaexts of { buf' ->
953 case currentChar# buf' of
958 just_a_conid = cont (ITconid lexeme) buf'
959 lexeme = lexemeToFastString buf'
960 munch = lex_qid cont glaexts lexeme (incLexeme buf') just_a_conid
963 lex_qid cont glaexts mod buf just_a_conid =
964 -- trace ("quid: "{-++unpackFS lexeme-}) $
965 case currentChar# buf of
966 '['# -> -- Special case for []
967 case lookAhead# buf 1# of
968 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
971 '('# -> -- Special case for (,,,)
972 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
973 case lookAhead# buf 1# of
974 '#'# | flag glaexts -> case lookAhead# buf 2# of
975 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
978 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
979 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
982 '-'# -> case lookAhead# buf 1# of
983 '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
984 _ -> lex_id3 cont glaexts mod buf just_a_conid
985 _ -> lex_id3 cont glaexts mod buf just_a_conid
987 lex_id3 cont glaexts mod buf just_a_conid
988 | is_symbol (currentChar# buf) =
990 start_new_lexeme = stepOverLexeme buf
992 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
993 case expandWhile# is_symbol start_new_lexeme of { buf' ->
995 lexeme = lexemeToFastString buf'
996 -- real lexeme is M.<sym>
997 new_buf = mergeLexemes buf buf'
999 cont (mk_qvar_token mod lexeme) new_buf
1000 -- wrong, but arguably morally right: M... is now a qvarsym
1005 start_new_lexeme = stepOverLexeme buf
1007 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1008 case expandWhile# is_ident start_new_lexeme of { buf1 ->
1013 case slurp_trailing_hashes buf1 glaexts of { buf' ->
1016 lexeme = lexemeToFastString buf'
1017 new_buf = mergeLexemes buf buf'
1018 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1020 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1021 Nothing -> is_a_qvarid ;
1023 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
1024 -> is_a_qvarid -- recognised as keywords here.
1026 -> just_a_conid -- avoid M.where etc.
1029 slurp_trailing_hashes buf glaexts
1030 | flag glaexts = expandWhile# (`eqChar#` '#'#) buf
1035 | is_upper f = ITconid pk_str
1036 | is_ident f = ITvarid pk_str
1037 | f `eqChar#` ':'# = ITconsym pk_str
1038 | otherwise = ITvarsym pk_str
1040 (C# f) = _HEAD_ pk_str
1041 -- tl = _TAIL_ pk_str
1043 mk_qvar_token m token =
1044 -- trace ("mk_qvar ") $
1045 case mk_var_token token of
1046 ITconid n -> ITqconid (m,n)
1047 ITvarid n -> ITqvarid (m,n)
1048 ITconsym n -> ITqconsym (m,n)
1049 ITvarsym n -> ITqvarsym (m,n)
1050 _ -> ITunknown (show token)
1053 ----------------------------------------------------------------------------
1054 Horrible stuff for dealing with M.(,,,)
1057 lex_tuple cont mod buf back_off =
1061 case currentChar# buf of
1062 ','# -> go (n+1) (stepOn buf)
1063 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1066 lex_ubx_tuple cont mod buf back_off =
1070 case currentChar# buf of
1071 ','# -> go (n+1) (stepOn buf)
1072 '#'# -> case lookAhead# buf 1# of
1073 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1079 -----------------------------------------------------------------------------
1080 doDiscard rips along really fast, looking for a '##-}',
1081 indicating the end of the pragma we're skipping
1084 doDiscard inStr buf =
1085 case currentChar# buf of
1086 '#'# | inStr ==# 0# ->
1087 case lookAhead# buf 1# of { '#'# ->
1088 case lookAhead# buf 2# of { '-'# ->
1089 case lookAhead# buf 3# of { '}'# ->
1090 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
1091 _ -> doDiscard inStr (incLexeme buf) };
1092 _ -> doDiscard inStr (incLexeme buf) };
1093 _ -> doDiscard inStr (incLexeme buf) }
1097 odd_slashes buf flg i# =
1098 case lookAhead# buf i# of
1099 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1102 not_inStr = if inStr ==# 0# then 1# else 0#
1104 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1105 '\\'# -> -- escaping something..
1106 if odd_slashes buf True (negateInt# 2#)
1107 then -- odd number of slashes, " is escaped.
1108 doDiscard inStr (incLexeme buf)
1109 else -- even number of slashes, \ is escaped.
1110 doDiscard not_inStr (incLexeme buf)
1111 _ -> doDiscard not_inStr (incLexeme buf)
1113 '\''# | inStr ==# 0# ->
1114 case lookAhead# buf 1# of { '"'# ->
1115 case lookAhead# buf 2# of { '\''# ->
1116 doDiscard inStr (setCurrentPos# buf 3#);
1117 _ -> doDiscard inStr (incLexeme buf) };
1118 _ -> doDiscard inStr (incLexeme buf) }
1120 _ -> doDiscard inStr (incLexeme buf)
1124 -----------------------------------------------------------------------------
1135 data PState = PState {
1137 glasgow_exts :: Int#,
1140 context :: [LayoutContext]
1143 type P a = StringBuffer -- Input string
1148 returnP a buf s = POk s a
1150 thenP :: P a -> (a -> P b) -> P b
1151 m `thenP` k = \ buf s ->
1153 POk s1 a -> k a buf s1
1154 PFailed err -> PFailed err
1156 thenP_ :: P a -> P b -> P b
1157 m `thenP_` k = m `thenP` \_ -> k
1159 mapP :: (a -> P b) -> [a] -> P [b]
1160 mapP f [] = returnP []
1163 mapP f as `thenP` \bs ->
1166 failP :: String -> P a
1167 failP msg buf s = PFailed (text msg)
1169 failMsgP :: Message -> P a
1170 failMsgP msg buf s = PFailed msg
1172 lexError :: String -> P a
1173 lexError str buf s@PState{ loc = loc }
1174 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1176 getSrcLocP :: P SrcLoc
1177 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1179 -- use a temporary SrcLoc for the duration of the argument
1180 setSrcLocP :: SrcLoc -> P a -> P a
1181 setSrcLocP new_loc p buf s =
1182 case p buf s{ loc=new_loc } of
1184 PFailed e -> PFailed e
1186 getSrcFile :: P FAST_STRING
1187 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1189 getContext :: P [LayoutContext]
1190 getContext buf s@(PState{ context = ctx }) = POk s ctx
1192 pushContext :: LayoutContext -> P ()
1193 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1197 This special case in layoutOn is to handle layout contexts with are
1198 indented the same or less than the current context. This is illegal
1199 according to the Haskell spec, so we have to arrange to close the
1200 current context. eg.
1205 after the first 'where', the sequence of events is:
1207 - layout system inserts a ';' (column 0)
1208 - parser begins a new context at column 0
1209 - parser shifts ';' (legal empty declaration)
1210 - parser sees 'class': parse error (we're still in the inner context)
1212 trouble is, by the time we know we need a new context, the lexer has
1213 already generated the ';'. Hacky solution is as follows: since we
1214 know the column of the next token (it's the column number of the new
1215 context), we set the ACTUAL column number of the new context to this
1216 numer plus one. Hence the next time the lexer is called, a '}' will
1217 be generated to close the new context straight away. Furthermore, we
1218 have to set the atbol flag so that the ';' that the parser shifted as
1219 part of the new context is re-generated.
1221 when the new context is *less* indented than the current one:
1223 f = f where g = g where
1226 - current context: column 12.
1227 - on seeing 'h' (column 0), the layout system inserts '}'
1228 - parser starts a new context, column 0
1229 - parser sees '}', uses it to close new context
1230 - we still need to insert another '}' followed by a ';',
1231 hence the atbol trick.
1233 There's also a special hack in here to deal with
1240 i.e. the inner context is at the same indentation level as the outer
1241 context. This is strictly illegal according to Haskell 98, but
1242 there's a lot of existing code using this style and it doesn't make
1243 any sense to disallow it, since empty 'do' lists don't make sense.
1246 layoutOn :: Bool -> P ()
1247 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1248 let offset = lexemeIndex buf -# bol in
1251 | if strict then prev_off >=# offset else prev_off ># offset ->
1252 --trace ("layout on, column: " ++ show (I# offset)) $
1253 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1255 --trace ("layout on, column: " ++ show (I# offset)) $
1256 POk s{ context = Layout offset : ctx } ()
1259 layoutOff buf s@(PState{ context = ctx }) =
1260 POk s{ context = NoLayout:ctx } ()
1263 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1265 (_:tl) -> POk s{ context = tl } ()
1266 [] -> PFailed (srcParseErr buf loc)
1269 Note that if the name of the file we're processing ends
1270 with `hi-boot', we accept it on faith as having the right
1271 version. This is done so that .hi-boot files that comes
1272 with hsc don't have to be updated before every release,
1273 *and* it allows us to share .hi-boot files with versions
1274 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1276 If the version number is 0, the checking is also turned off.
1277 (needed to deal with GHC.hi only!)
1279 Once we can assume we're compiling with a version of ghc that
1280 supports interface file checking, we can drop the special
1283 checkVersion :: Maybe Integer -> P ()
1284 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1285 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1286 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1287 checkVersion mb@Nothing buf s@(PState{loc = loc})
1288 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1289 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1291 -----------------------------------------------------------------
1293 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1295 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1296 text (lexemeToString s), char '\'']
1298 ifaceVersionErr hi_vers l toks
1299 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1300 ptext SLIT("Expected"), int opt_HiVersion,
1301 ptext SLIT("found "), pp_version]
1305 Nothing -> ptext SLIT("pre ghc-3.02 version")
1306 Just v -> ptext SLIT("version") <+> integer v
1308 -----------------------------------------------------------------------------
1310 srcParseErr :: StringBuffer -> SrcLoc -> Message
1314 then ptext SLIT(": parse error (possibly incorrect indentation)")
1315 else hcat [ptext SLIT(": parse error on input "),
1316 char '`', text token, char '\'']
1319 token = lexemeToString s