2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
24 Token(..), lexer, ParseResult(..), PState(..),
25 ExtFlags(..), mkPState,
28 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
29 getSrcLocP, setSrcLocP, getSrcFile,
30 layoutOn, layoutOff, pushContext, popContext
33 #include "HsVersions.h"
35 import Char ( toUpper )
37 import PrelNames ( mkTupNameStr )
38 import ForeignCall ( Safety(..) )
39 import UniqFM ( listToUFM, lookupUFM )
40 import BasicTypes ( Boxity(..) )
41 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
42 replaceSrcLine, mkSrcLoc )
44 import ErrUtils ( Message )
51 import Char ( chr, ord )
53 import Bits ( Bits(..) ) -- non-std
55 #if __GLASGOW_HASKELL__ >= 503
56 import GHC.Read ( readRational__ ) -- Glasgow non-std
58 import PrelRead ( readRational__ ) -- Glasgow non-std
63 %************************************************************************
65 \subsection{Data types}
67 %************************************************************************
69 The token data type, fairly un-interesting except from one
70 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
71 strictness, unfolding etc).
73 The Idea/Observation here is that the renamer needs to scan through
74 all of an interface file before it can continue. But only a fraction
75 of the information contained in the file turns out to be useful, so
76 delaying as much as possible of the scanning and parsing of an
77 interface file Makes Sense (Heap profiles of the compiler
78 show a reduction in heap usage by at least a factor of two,
81 Hence, the interface file lexer spots when value declarations are
82 being scanned and return the @ITidinfo@ and @ITtype@ constructors
83 for the type and any other id info for that binding (unfolding, strictness
84 etc). These constructors are applied to the result of lexing these sub-chunks.
86 The lexing of the type and id info is all done lazily, of course, so
87 the scanning (and subsequent parsing) will be done *only* on the ids the
88 renamer finds out that it is interested in. The rest will just be junked.
89 Laziness, you know it makes sense :-)
93 = ITas -- Haskell keywords
117 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
119 | ITforall -- GHC extension keywords
131 | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc)
133 | ITspecialise_prag -- Pragmas
143 | ITdotdot -- reserved symbols
158 | ITbiglam -- GHC-extension symbols
160 | ITocurly -- special symbols
162 | ITocurlybar -- {|, for type applications
163 | ITccurlybar -- |}, for type applications
166 | ITopabrack -- [:, for parallel arrays with -fparr
167 | ITcpabrack -- :], for parallel arrays with -fparr
178 | ITvarid FAST_STRING -- identifiers
179 | ITconid FAST_STRING
180 | ITvarsym FAST_STRING
181 | ITconsym FAST_STRING
182 | ITqvarid (FAST_STRING,FAST_STRING)
183 | ITqconid (FAST_STRING,FAST_STRING)
184 | ITqvarsym (FAST_STRING,FAST_STRING)
185 | ITqconsym (FAST_STRING,FAST_STRING)
187 | ITdupipvarid FAST_STRING -- GHC extension: implicit param: ?x
188 | ITsplitipvarid FAST_STRING -- GHC extension: implicit param: %x
190 | ITpragma StringBuffer
193 | ITstring FAST_STRING
195 | ITrational Rational
198 | ITprimstring FAST_STRING
200 | ITprimfloat Rational
201 | ITprimdouble Rational
202 | ITlitlit FAST_STRING
204 | ITunknown String -- Used when the lexer can't make sense of it
205 | ITeof -- end of file token
206 deriving Show -- debugging
209 -----------------------------------------------------------------------------
213 pragmaKeywordsFM = listToUFM $
214 map (\ (x,y) -> (_PK_ x,y))
215 [( "SPECIALISE", ITspecialise_prag ),
216 ( "SPECIALIZE", ITspecialise_prag ),
217 ( "SOURCE", ITsource_prag ),
218 ( "INLINE", ITinline_prag ),
219 ( "NOINLINE", ITnoinline_prag ),
220 ( "NOTINLINE", ITnoinline_prag ),
221 ( "LINE", ITline_prag ),
222 ( "RULES", ITrules_prag ),
223 ( "RULEZ", ITrules_prag ), -- american spelling :-)
224 ( "SCC", ITscc_prag ),
225 ( "DEPRECATED", ITdeprecated_prag )
228 haskellKeywordsFM = listToUFM $
229 map (\ (x,y) -> (_PK_ x,y))
230 [( "_", ITunderscore ),
233 ( "class", ITclass ),
235 ( "default", ITdefault ),
236 ( "deriving", ITderiving ),
239 ( "hiding", IThiding ),
241 ( "import", ITimport ),
243 ( "infix", ITinfix ),
244 ( "infixl", ITinfixl ),
245 ( "infixr", ITinfixr ),
246 ( "instance", ITinstance ),
248 ( "module", ITmodule ),
249 ( "newtype", ITnewtype ),
251 ( "qualified", ITqualified ),
254 ( "where", ITwhere ),
255 ( "_scc_", ITscc ) -- ToDo: remove
258 isSpecial :: Token -> Bool
259 -- If we see M.x, where x is a keyword, but
260 -- is special, we treat is as just plain M.x,
262 isSpecial ITas = True
263 isSpecial IThiding = True
264 isSpecial ITqualified = True
265 isSpecial ITforall = True
266 isSpecial ITexport = True
267 isSpecial ITlabel = True
268 isSpecial ITdynamic = True
269 isSpecial ITsafe = True
270 isSpecial ITthreadsafe = True
271 isSpecial ITunsafe = True
272 isSpecial ITwith = True
273 isSpecial ITccallconv = True
274 isSpecial ITstdcallconv = True
277 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
278 ghcExtensionKeywordsFM = listToUFM $
279 map (\ (x,y) -> (_PK_ x,y))
280 [ ( "forall", ITforall ),
281 ( "foreign", ITforeign ),
282 ( "export", ITexport ),
283 ( "label", ITlabel ),
284 ( "dynamic", ITdynamic ),
286 ( "threadsafe", ITthreadsafe ),
287 ( "unsafe", ITunsafe ),
289 ( "stdcall", ITstdcallconv),
290 ( "ccall", ITccallconv),
291 ( "dotnet", ITdotnet),
292 ("_ccall_", ITccall (False, False, PlayRisky)),
293 ("_ccall_GC_", ITccall (False, False, PlaySafe False)),
294 ("_casm_", ITccall (False, True, PlayRisky)),
295 ("_casm_GC_", ITccall (False, True, PlaySafe False))
299 haskellKeySymsFM = listToUFM $
300 map (\ (x,y) -> (_PK_ x,y))
314 ,(".", ITdot) -- sadly, for 'forall a . t'
318 -----------------------------------------------------------------------------
323 - (exts) lexing a source with extensions, eg, an interface file or
325 - (bol) pointer to beginning of line (for column calculations)
326 - (buf) pointer to beginning of token
327 - (buf) pointer to current char
328 - (atbol) flag indicating whether we're at the beginning of a line
331 lexer :: (Token -> P a) -> P a
332 lexer cont buf s@(PState{
340 -- first, start a new lexeme and lose all the whitespace
342 tab line bol atbol (stepOverLexeme buf)
344 line = srcLocLine loc
346 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
347 case currentChar# buf of
350 if bufferExhausted (stepOn buf)
351 then cont ITeof buf s'
352 else trace "lexer: misplaced NUL?" $
353 tab y bol atbol (stepOn buf)
355 '\n'# -> let buf' = stepOn buf
356 in tab (y +# 1#) (currentIndex# buf') 1# buf'
358 -- find comments. This got harder in Haskell 98.
359 '-'# -> let trundle n =
360 let next = lookAhead# buf n in
361 if next `eqChar#` '-'# then trundle (n +# 1#)
362 else if is_symbol next || n <# 2#
365 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
368 -- comments and pragmas. We deal with LINE pragmas here,
369 -- and throw out any unrecognised pragmas as comments. Any
370 -- pragmas we know about are dealt with later (after any layout
371 -- processing if necessary).
372 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
373 if lookAhead# buf 2# `eqChar#` '#'# then
374 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
375 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
376 let lexeme = mkFastString -- ToDo: too slow
377 (map toUpper (lexemeToString buf2)) in
378 case lookupUFM pragmaKeywordsFM lexeme of
379 -- ignore RULES pragmas when -fglasgow-exts is off
380 Just ITrules_prag | not (glaExtsEnabled exts) ->
381 skip_to_end (stepOnBy# buf 2#) s'
383 line_prag skip_to_end buf2 s'
384 Just other -> is_a_token
385 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
388 else skip_to_end (stepOnBy# buf 2#) s'
390 skip_to_end = skipNestedComment (lexer cont)
392 -- special GHC extension: we grok cpp-style #line pragmas
393 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
394 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
395 lookAhead# buf 2# `eqChar#` 'i'# &&
396 lookAhead# buf 3# `eqChar#` 'n'# &&
397 lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5#
398 | otherwise = stepOn buf
400 case expandWhile# is_space buf1 of { buf2 ->
401 if is_digit (currentChar# buf2)
402 then line_prag next_line buf2 s'
406 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
408 -- tabs have been expanded beforehand
409 c | is_space c -> tab y bol atbol (stepOn buf)
410 | otherwise -> is_a_token
412 where s' = s{loc = replaceSrcLine loc y,
416 is_a_token | atbol /=# 0# = lexBOL cont buf s'
417 | otherwise = lexToken cont exts buf s'
419 -- {-# LINE .. #-} pragmas. yeuch.
420 line_prag cont buf s@PState{loc=loc} =
421 case expandWhile# is_space buf of { buf1 ->
422 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
423 -- subtract one: the line number refers to the *following* line.
424 let real_line = line - 1 in
425 case fromInteger real_line of { i@(I# l) ->
426 -- ToDo, if no filename then we skip the newline.... d'oh
427 case expandWhile# is_space buf2 of { buf3 ->
428 case currentChar# buf3 of
430 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
432 file = lexemeToFastString buf4
433 new_buf = stepOn (stepOverLexeme buf4)
435 if nullFastString file
436 then cont new_buf s{loc = replaceSrcLine loc l}
437 else cont new_buf s{loc = mkSrcLoc file i}
439 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
442 skipNestedComment :: P a -> P a
443 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
445 skipNestedComment' :: SrcLoc -> P a -> P a
446 skipNestedComment' orig_loc cont buf = loop buf
449 case currentChar# buf of
450 '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
452 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
454 (skipNestedComment' orig_loc cont)
457 '\n'# -> \ s@PState{loc=loc} ->
458 let buf' = stepOn buf in
459 loop buf' s{loc = incSrcLine loc,
460 bol = currentIndex# buf',
463 -- pass the original SrcLoc to lexError so that the error is
464 -- reported at the line it was originally on, not the line at
465 -- the end of the file.
466 '\NUL'# | bufferExhausted (stepOn buf) ->
467 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
469 _ -> loop (stepOn buf)
471 -- When we are lexing the first token of a line, check whether we need to
472 -- insert virtual semicolons or close braces due to layout.
474 lexBOL :: (Token -> P a) -> P a
475 lexBOL cont buf s@(PState{
482 if need_close_curly then
483 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
484 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
485 else if need_semi_colon then
486 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
487 cont ITsemi buf s{atbol = 0#}
489 lexToken cont exts buf s{atbol = 0#}
491 col = currentIndex# buf -# bol
504 Layout n -> col ==# n
507 lexToken :: (Token -> P a) -> Int# -> P a
508 lexToken cont exts buf =
509 -- trace "lexToken" $
510 case currentChar# buf of
512 -- special symbols ----------------------------------------------------
513 '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
514 -> cont IToubxparen (setCurrentPos# buf 2#)
516 -> cont IToparen (incLexeme buf)
518 ')'# -> cont ITcparen (incLexeme buf)
519 '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
520 cont ITopabrack (setCurrentPos# buf 2#)
522 cont ITobrack (incLexeme buf)
523 ']'# -> cont ITcbrack (incLexeme buf)
524 ','# -> cont ITcomma (incLexeme buf)
525 ';'# -> cont ITsemi (incLexeme buf)
526 '}'# -> \ s@PState{context = ctx} ->
528 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
529 _ -> lexError "too many '}'s" buf s
530 '|'# -> case lookAhead# buf 1# of
531 '}'# | glaExtsEnabled exts -> cont ITccurlybar
532 (setCurrentPos# buf 2#)
533 _ -> lex_sym cont (incLexeme buf)
534 ':'# -> case lookAhead# buf 1# of
535 ']'# | parrEnabled exts -> cont ITcpabrack
536 (setCurrentPos# buf 2#)
537 _ -> lex_sym cont (incLexeme buf)
540 '#'# -> case lookAhead# buf 1# of
541 ')'# | glaExtsEnabled exts
542 -> cont ITcubxparen (setCurrentPos# buf 2#)
543 '-'# -> case lookAhead# buf 2# of
544 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
545 _ -> lex_sym cont (incLexeme buf)
546 _ -> lex_sym cont (incLexeme buf)
548 '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
549 -> lex_cstring cont (setCurrentPos# buf 2#)
551 -> cont ITbackquote (incLexeme buf)
553 '{'# -> -- for Emacs: -}
554 case lookAhead# buf 1# of
555 '|'# | glaExtsEnabled exts
556 -> cont ITocurlybar (setCurrentPos# buf 2#)
557 '-'# -> case lookAhead# buf 2# of
558 '#'# -> lex_prag cont (setCurrentPos# buf 3#)
559 _ -> cont ITocurly (incLexeme buf)
560 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
562 -- strings/characters -------------------------------------------------
563 '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
564 '\''# -> lex_char (char_end cont) exts (incLexeme buf)
566 -- Hexadecimal and octal constants
567 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
568 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
569 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
570 -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec
571 where ch = lookAhead# buf 1#
572 ch2 = lookAhead# buf 2#
573 buf' = setCurrentPos# buf 2#
576 if bufferExhausted (stepOn buf) then
579 trace "lexIface: misplaced NUL?" $
580 cont (ITunknown "\NUL") (stepOn buf)
582 '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
583 lex_ip ITdupipvarid cont (incLexeme buf)
584 '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
585 lex_ip ITsplitipvarid cont (incLexeme buf)
586 c | is_digit c -> lex_num cont exts 0 buf
587 | is_symbol c -> lex_sym cont buf
588 | is_upper c -> lex_con cont exts buf
589 | is_ident c -> lex_id cont exts buf
590 | otherwise -> lexError "illegal character" buf
592 -- Int# is unlifted, and therefore faster than Bool for flags.
598 -------------------------------------------------------------------------------
602 = case expandWhile# is_space buf of { buf1 ->
603 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
604 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
605 case lookupUFM pragmaKeywordsFM lexeme of
606 Just kw -> cont kw (mergeLexemes buf buf2)
607 Nothing -> panic "lex_prag"
610 -------------------------------------------------------------------------------
613 lex_string cont exts s buf
614 = case currentChar# buf of
616 let buf' = incLexeme buf
617 s' = mkFastStringNarrow (map chr (reverse s))
618 in case currentChar# buf' of
619 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
620 then cont (ITprimstring s') (incLexeme buf')
621 else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
622 _ -> cont (ITstring s') buf'
624 -- ignore \& in a string, deal with string gaps
625 '\\'# | next_ch `eqChar#` '&'#
626 -> lex_string cont exts s buf'
628 -> lex_stringgap cont exts s (incLexeme buf)
630 where next_ch = lookAhead# buf 1#
631 buf' = setCurrentPos# buf 2#
633 _ -> lex_char (lex_next_string cont s) exts buf
635 lex_stringgap cont exts s buf
636 = let buf' = incLexeme buf in
637 case currentChar# buf of
638 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
639 st{loc = incSrcLine loc}
640 '\\'# -> lex_string cont exts s buf'
641 c | is_space c -> lex_stringgap cont exts s buf'
642 other -> charError buf'
644 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
646 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
647 lex_char cont exts buf
648 = case currentChar# buf of
649 '\\'# -> lex_escape (cont exts) (incLexeme buf)
650 c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
651 other -> charError buf
653 char_end cont exts c buf
654 = case currentChar# buf of
655 '\''# -> let buf' = incLexeme buf in
656 case currentChar# buf' of
657 '#'# | glaExtsEnabled exts
658 -> cont (ITprimchar c) (incLexeme buf')
659 _ -> cont (ITchar c) buf'
663 = let buf' = incLexeme buf in
664 case currentChar# buf of
665 'a'# -> cont (ord '\a') buf'
666 'b'# -> cont (ord '\b') buf'
667 'f'# -> cont (ord '\f') buf'
668 'n'# -> cont (ord '\n') buf'
669 'r'# -> cont (ord '\r') buf'
670 't'# -> cont (ord '\t') buf'
671 'v'# -> cont (ord '\v') buf'
672 '\\'# -> cont (ord '\\') buf'
673 '"'# -> cont (ord '\"') buf'
674 '\''# -> cont (ord '\'') buf'
675 '^'# -> let c = currentChar# buf' in
676 if c `geChar#` '@'# && c `leChar#` '_'#
677 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
680 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
681 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
683 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
685 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
686 Just buf2 <- [prefixMatch buf p] ] of
687 (c,buf2):_ -> cont (ord c) buf2
690 after_charnum cont i buf
691 = if i >= 0 && i <= 0x10FFFF
692 then cont (fromInteger i) buf
695 readNum cont buf is_digit base conv = read buf 0
697 = case currentChar# buf of { c ->
699 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
705 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
706 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
708 hex c | is_digit c = ord# c -# ord# '0'#
709 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
710 oct_or_dec c = ord# c -# ord# '0'#
712 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
715 | c `geChar#` 'A'# && c `leChar#` 'Z'#
716 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
719 charError buf = lexError "error in character literal" buf
721 silly_escape_chars = [
758 -----------------------------------------------------------------------------
761 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
762 lex_num cont exts acc buf =
763 case scanNumLit acc buf of
765 case currentChar# buf' of
766 '.'# | is_digit (lookAhead# buf' 1#) ->
767 -- this case is not optimised at all, as the
768 -- presence of floating point numbers in interface
769 -- files is not that common. (ToDo)
770 case expandWhile# is_digit (incLexeme buf') of
771 buf2 -> -- points to first non digit char
773 let l = case currentChar# buf2 of
779 = let buf3 = incLexeme buf2 in
780 case currentChar# buf3 of
781 '-'# | is_digit (lookAhead# buf3 1#)
782 -> expandWhile# is_digit (incLexeme buf3)
783 '+'# | is_digit (lookAhead# buf3 1#)
784 -> expandWhile# is_digit (incLexeme buf3)
785 x | is_digit x -> expandWhile# is_digit buf3
788 v = readRational__ (lexemeToString l)
790 in case currentChar# l of -- glasgow exts only
791 '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
792 case currentChar# l' of
793 '#'# -> cont (ITprimdouble v) (incLexeme l')
794 _ -> cont (ITprimfloat v) l'
795 _ -> cont (ITrational v) l
797 _ -> after_lexnum cont exts acc' buf'
799 after_lexnum cont exts i buf
800 = case currentChar# buf of
801 '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
802 _ -> cont (ITinteger i) buf
804 -----------------------------------------------------------------------------
805 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
807 -- we lexemeToFastString on the bit between the ``''s, but include the
808 -- quotes in the full lexeme.
810 lex_cstring cont buf =
811 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
812 Just buf' -> cont (ITlitlit (lexemeToFastString
813 (setCurrentPos# buf' (negateInt# 2#))))
814 (mergeLexemes buf buf')
815 Nothing -> lexError "unterminated ``" buf
817 -----------------------------------------------------------------------------
818 -- identifiers, symbols etc.
820 lex_ip ip_constr cont buf =
821 case expandWhile# is_ident buf of
822 buf' -> cont (ip_constr (tailFS lexeme)) buf'
823 where lexeme = lexemeToFastString buf'
825 lex_id cont exts buf =
826 let buf1 = expandWhile# is_ident buf in
829 case (if glaExtsEnabled exts
830 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
831 else buf1) of { buf' ->
834 let lexeme = lexemeToFastString buf' in
836 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
837 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
841 let var_token = cont (ITvarid lexeme) buf' in
843 if not (glaExtsEnabled exts)
847 case lookupUFM ghcExtensionKeywordsFM lexeme of {
848 Just kwd_token -> cont kwd_token buf';
855 case expandWhile# is_symbol buf of
856 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
857 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
858 cont kwd_token buf' ;
859 Nothing -> --trace ("sym: "++unpackFS lexeme) $
860 cont (mk_var_token lexeme) buf'
862 where lexeme = lexemeToFastString buf'
865 -- lex_con recursively collects components of a qualified identifer.
866 -- The argument buf is the StringBuffer representing the lexeme
867 -- identified so far, where the next character is upper-case.
869 lex_con cont exts buf =
870 -- trace ("con: "{-++unpackFS lexeme-}) $
871 let empty_buf = stepOverLexeme buf in
872 case expandWhile# is_ident empty_buf of { buf1 ->
873 case slurp_trailing_hashes buf1 exts of { con_buf ->
875 let all_buf = mergeLexemes buf con_buf
877 con_lexeme = lexemeToFastString con_buf
878 mod_lexeme = lexemeToFastString (decLexeme buf)
879 all_lexeme = lexemeToFastString all_buf
882 | emptyLexeme buf = cont (ITconid con_lexeme) all_buf
883 | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
886 case currentChar# all_buf of
887 '.'# -> maybe_qualified cont exts all_lexeme
888 (incLexeme all_buf) just_a_conid
893 maybe_qualified cont exts mod buf just_a_conid =
894 -- trace ("qid: "{-++unpackFS lexeme-}) $
895 case currentChar# buf of
896 '['# -> -- Special case for []
897 case lookAhead# buf 1# of
898 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
901 '('# -> -- Special case for (,,,)
902 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
903 case lookAhead# buf 1# of
904 '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
905 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
908 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
909 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
912 '-'# -> case lookAhead# buf 1# of
913 '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
914 _ -> lex_id3 cont exts mod buf just_a_conid
916 _ -> lex_id3 cont exts mod buf just_a_conid
919 lex_id3 cont exts mod buf just_a_conid
920 | is_upper (currentChar# buf) =
921 lex_con cont exts buf
923 | is_symbol (currentChar# buf) =
925 start_new_lexeme = stepOverLexeme buf
927 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
928 case expandWhile# is_symbol start_new_lexeme of { buf' ->
930 lexeme = lexemeToFastString buf'
931 -- real lexeme is M.<sym>
932 new_buf = mergeLexemes buf buf'
934 cont (mk_qvar_token mod lexeme) new_buf
935 -- wrong, but arguably morally right: M... is now a qvarsym
940 start_new_lexeme = stepOverLexeme buf
942 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
943 case expandWhile# is_ident start_new_lexeme of { buf1 ->
948 case slurp_trailing_hashes buf1 exts of { buf' ->
951 lexeme = lexemeToFastString buf'
952 new_buf = mergeLexemes buf buf'
953 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
955 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
956 Nothing -> is_a_qvarid ;
958 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
959 -> is_a_qvarid -- recognised as keywords here.
961 -> just_a_conid -- avoid M.where etc.
964 slurp_trailing_hashes buf exts
965 | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
970 | is_upper f = ITconid pk_str
971 | is_ident f = ITvarid pk_str
972 | f `eqChar#` ':'# = ITconsym pk_str
973 | otherwise = ITvarsym pk_str
975 (C# f) = _HEAD_ pk_str
976 -- tl = _TAIL_ pk_str
978 mk_qvar_token m token =
979 -- trace ("mk_qvar ") $
980 case mk_var_token token of
981 ITconid n -> ITqconid (m,n)
982 ITvarid n -> ITqvarid (m,n)
983 ITconsym n -> ITqconsym (m,n)
984 ITvarsym n -> ITqvarsym (m,n)
985 _ -> ITunknown (show token)
988 ----------------------------------------------------------------------------
989 Horrible stuff for dealing with M.(,,,)
992 lex_tuple cont mod buf back_off =
996 case currentChar# buf of
997 ','# -> go (n+1) (stepOn buf)
998 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1001 lex_ubx_tuple cont mod buf back_off =
1005 case currentChar# buf of
1006 ','# -> go (n+1) (stepOn buf)
1007 '#'# -> case lookAhead# buf 1# of
1008 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1014 -----------------------------------------------------------------------------
1025 data PState = PState {
1027 extsBitmap :: Int#, -- bitmap that determines permitted extensions
1030 context :: [LayoutContext]
1033 type P a = StringBuffer -- Input string
1038 returnP a buf s = POk s a
1040 thenP :: P a -> (a -> P b) -> P b
1041 m `thenP` k = \ buf s ->
1043 POk s1 a -> k a buf s1
1044 PFailed err -> PFailed err
1046 thenP_ :: P a -> P b -> P b
1047 m `thenP_` k = m `thenP` \_ -> k
1049 mapP :: (a -> P b) -> [a] -> P [b]
1050 mapP f [] = returnP []
1053 mapP f as `thenP` \bs ->
1056 failP :: String -> P a
1057 failP msg buf s = PFailed (text msg)
1059 failMsgP :: Message -> P a
1060 failMsgP msg buf s = PFailed msg
1062 lexError :: String -> P a
1063 lexError str buf s@PState{ loc = loc }
1064 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1066 getSrcLocP :: P SrcLoc
1067 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1069 -- use a temporary SrcLoc for the duration of the argument
1070 setSrcLocP :: SrcLoc -> P a -> P a
1071 setSrcLocP new_loc p buf s =
1072 case p buf s{ loc=new_loc } of
1074 PFailed e -> PFailed e
1076 getSrcFile :: P FAST_STRING
1077 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1079 pushContext :: LayoutContext -> P ()
1080 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1084 This special case in layoutOn is to handle layout contexts with are
1085 indented the same or less than the current context. This is illegal
1086 according to the Haskell spec, so we have to arrange to close the
1087 current context. eg.
1092 after the first 'where', the sequence of events is:
1094 - layout system inserts a ';' (column 0)
1095 - parser begins a new context at column 0
1096 - parser shifts ';' (legal empty declaration)
1097 - parser sees 'class': parse error (we're still in the inner context)
1099 trouble is, by the time we know we need a new context, the lexer has
1100 already generated the ';'. Hacky solution is as follows: since we
1101 know the column of the next token (it's the column number of the new
1102 context), we set the ACTUAL column number of the new context to this
1103 numer plus one. Hence the next time the lexer is called, a '}' will
1104 be generated to close the new context straight away. Furthermore, we
1105 have to set the atbol flag so that the ';' that the parser shifted as
1106 part of the new context is re-generated.
1108 when the new context is *less* indented than the current one:
1110 f = f where g = g where
1113 - current context: column 12.
1114 - on seeing 'h' (column 0), the layout system inserts '}'
1115 - parser starts a new context, column 0
1116 - parser sees '}', uses it to close new context
1117 - we still need to insert another '}' followed by a ';',
1118 hence the atbol trick.
1120 There's also a special hack in here to deal with
1127 i.e. the inner context is at the same indentation level as the outer
1128 context. This is strictly illegal according to Haskell 98, but
1129 there's a lot of existing code using this style and it doesn't make
1130 any sense to disallow it, since empty 'do' lists don't make sense.
1133 layoutOn :: Bool -> P ()
1134 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1135 let offset = lexemeIndex buf -# bol in
1138 | if strict then prev_off >=# offset else prev_off ># offset ->
1139 --trace ("layout on, column: " ++ show (I# offset)) $
1140 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1142 --trace ("layout on, column: " ++ show (I# offset)) $
1143 POk s{ context = Layout offset : ctx } ()
1146 layoutOff buf s@(PState{ context = ctx }) =
1147 POk s{ context = NoLayout:ctx } ()
1150 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1152 (_:tl) -> POk s{ context = tl } ()
1153 [] -> PFailed (srcParseErr buf loc)
1155 -- for reasons of efficiency, flags indicating language extensions (eg,
1156 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1159 glaExtsBit, ffiBit, parrBit :: Int
1161 ffiBit = 1 -- FIXME: not used yet; still part of `glaExtsBit'
1164 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1165 glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit
1166 ffiEnabled flags = testBit (toInt32 flags) ffiBit
1167 parrEnabled flags = testBit (toInt32 flags) parrBit
1169 toInt32 :: Int# -> Int32
1170 toInt32 x# = fromIntegral (I# x#)
1172 -- convenient record-based bitmap for the interface to the rest of the world
1174 data ExtFlags = ExtFlags {
1175 glasgowExtsEF :: Bool,
1176 -- ffiEF :: Bool, -- commented out to avoid warnings
1177 parrEF :: Bool -- while not used yet
1180 -- create a parse state
1182 mkPState :: SrcLoc -> ExtFlags -> PState
1183 mkPState loc exts = PState {
1185 extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits},
1191 bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts
1192 -- .|. ffiBit `setBitIf` ffiEF exts
1193 .|. parrBit `setBitIf` parrEF exts
1195 setBitIf :: Int -> Bool -> Int32
1196 b `setBitIf` cond | cond = bit b
1199 -----------------------------------------------------------------------------
1201 srcParseErr :: StringBuffer -> SrcLoc -> Message
1205 then ptext SLIT(": parse error (possibly incorrect indentation)")
1206 else hcat [ptext SLIT(": parse error on input "),
1207 char '`', text token, char '\'']
1210 token = lexemeToString s