2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
24 Token(..), lexer, ParseResult(..), PState(..),
25 ExtFlags(..), mkPState,
28 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
29 getSrcLocP, setSrcLocP, getSrcFile,
30 layoutOn, layoutOff, pushContext, popContext
33 #include "HsVersions.h"
35 import Char ( toUpper, isDigit, chr, ord )
38 import PrelNames ( mkTupNameStr )
39 import ForeignCall ( Safety(..) )
40 import UniqFM ( listToUFM, lookupUFM )
41 import BasicTypes ( Boxity(..) )
42 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
43 replaceSrcLine, mkSrcLoc )
45 import ErrUtils ( Message )
53 import DATA_BITS ( Bits(..) )
54 import DATA_INT ( Int32 )
57 %************************************************************************
59 \subsection{Data types}
61 %************************************************************************
63 The token data type, fairly un-interesting except from one
64 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
65 strictness, unfolding etc).
67 The Idea/Observation here is that the renamer needs to scan through
68 all of an interface file before it can continue. But only a fraction
69 of the information contained in the file turns out to be useful, so
70 delaying as much as possible of the scanning and parsing of an
71 interface file Makes Sense (Heap profiles of the compiler
72 show a reduction in heap usage by at least a factor of two,
75 Hence, the interface file lexer spots when value declarations are
76 being scanned and return the @ITidinfo@ and @ITtype@ constructors
77 for the type and any other id info for that binding (unfolding, strictness
78 etc). These constructors are applied to the result of lexing these sub-chunks.
80 The lexing of the type and id info is all done lazily, of course, so
81 the scanning (and subsequent parsing) will be done *only* on the ids the
82 renamer finds out that it is interested in. The rest will just be junked.
83 Laziness, you know it makes sense :-)
87 = ITas -- Haskell keywords
111 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
113 | ITforall -- GHC extension keywords
125 | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc)
128 | ITspecialise_prag -- Pragmas
136 | ITcore_prag -- hdaume: core annotations
139 | ITdotdot -- reserved symbols
155 | ITbiglam -- GHC-extension symbols
157 | ITocurly -- special symbols
159 | ITocurlybar -- {|, for type applications
160 | ITccurlybar -- |}, for type applications
163 | ITopabrack -- [:, for parallel arrays with -fparr
164 | ITcpabrack -- :], for parallel arrays with -fparr
175 | ITvarid FastString -- identifiers
177 | ITvarsym FastString
178 | ITconsym FastString
179 | ITqvarid (FastString,FastString)
180 | ITqconid (FastString,FastString)
181 | ITqvarsym (FastString,FastString)
182 | ITqconsym (FastString,FastString)
184 | ITdupipvarid FastString -- GHC extension: implicit param: ?x
185 | ITsplitipvarid FastString -- GHC extension: implicit param: %x
187 | ITpragma StringBuffer
190 | ITstring FastString
192 | ITrational Rational
195 | ITprimstring FastString
197 | ITprimfloat Rational
198 | ITprimdouble Rational
199 | ITlitlit FastString
201 -- MetaHaskell extension tokens
202 | ITopenExpQuote -- [| or [e|
203 | ITopenPatQuote -- [p|
204 | ITopenDecQuote -- [d|
205 | ITopenTypQuote -- [t|
207 | ITidEscape FastString -- $x
208 | ITparenEscape -- $(
213 -- Arrow notation extension
220 | ITLarrowtail -- -<<
221 | ITRarrowtail -- >>-
223 | ITunknown String -- Used when the lexer can't make sense of it
224 | ITeof -- end of file token
225 deriving Show -- debugging
228 -----------------------------------------------------------------------------
232 pragmaKeywordsFM = listToUFM $
233 map (\ (x,y) -> (mkFastString x,y))
234 [( "SPECIALISE", ITspecialise_prag ),
235 ( "SPECIALIZE", ITspecialise_prag ),
236 ( "SOURCE", ITsource_prag ),
237 ( "INLINE", ITinline_prag ),
238 ( "NOINLINE", ITnoinline_prag ),
239 ( "NOTINLINE", ITnoinline_prag ),
240 ( "LINE", ITline_prag ),
241 ( "RULES", ITrules_prag ),
242 ( "RULEZ", ITrules_prag ), -- american spelling :-)
243 ( "SCC", ITscc_prag ),
244 ( "CORE", ITcore_prag ), -- hdaume: core annotation
245 ( "DEPRECATED", ITdeprecated_prag )
248 haskellKeywordsFM = listToUFM $
249 map (\ (x,y) -> (mkFastString x,y))
250 [( "_", ITunderscore ),
253 ( "class", ITclass ),
255 ( "default", ITdefault ),
256 ( "deriving", ITderiving ),
259 ( "hiding", IThiding ),
261 ( "import", ITimport ),
263 ( "infix", ITinfix ),
264 ( "infixl", ITinfixl ),
265 ( "infixr", ITinfixr ),
266 ( "instance", ITinstance ),
268 ( "module", ITmodule ),
269 ( "newtype", ITnewtype ),
271 ( "qualified", ITqualified ),
274 ( "where", ITwhere ),
275 ( "_scc_", ITscc ) -- ToDo: remove
278 isSpecial :: Token -> Bool
279 -- If we see M.x, where x is a keyword, but
280 -- is special, we treat is as just plain M.x,
282 isSpecial ITas = True
283 isSpecial IThiding = True
284 isSpecial ITqualified = True
285 isSpecial ITforall = True
286 isSpecial ITexport = True
287 isSpecial ITlabel = True
288 isSpecial ITdynamic = True
289 isSpecial ITsafe = True
290 isSpecial ITthreadsafe = True
291 isSpecial ITunsafe = True
292 isSpecial ITwith = True
293 isSpecial ITccallconv = True
294 isSpecial ITstdcallconv = True
295 isSpecial ITmdo = True
298 -- the bitmap provided as the third component indicates whether the
299 -- corresponding extension keyword is valid under the extension options
300 -- provided to the compiler; if the extension corresponding to *any* of the
301 -- bits set in the bitmap is enabled, the keyword is valid (this setup
302 -- facilitates using a keyword in two different extensions that can be
303 -- activated independently)
305 ghcExtensionKeywordsFM = listToUFM $
306 map (\(x, y, z) -> (mkFastString x, (y, z)))
307 [ ( "forall", ITforall, bit glaExtsBit),
308 ( "mdo", ITmdo, bit glaExtsBit),
309 ( "reifyDecl", ITreifyDecl, bit glaExtsBit),
310 ( "reifyType", ITreifyType, bit glaExtsBit),
311 ( "reifyFixity",ITreifyFixity, bit glaExtsBit),
313 ( "foreign", ITforeign, bit ffiBit),
314 ( "export", ITexport, bit ffiBit),
315 ( "label", ITlabel, bit ffiBit),
316 ( "dynamic", ITdynamic, bit ffiBit),
317 ( "safe", ITsafe, bit ffiBit),
318 ( "threadsafe", ITthreadsafe, bit ffiBit),
319 ( "unsafe", ITunsafe, bit ffiBit),
320 ( "stdcall", ITstdcallconv, bit ffiBit),
321 ( "ccall", ITccallconv, bit ffiBit),
322 ( "dotnet", ITdotnet, bit ffiBit),
324 ( "with", ITwith, bit withBit),
326 ( "rec", ITrec, bit arrowsBit),
327 ( "proc", ITproc, bit arrowsBit),
330 ("_ccall_", ITccall (False, False, PlayRisky),
332 ("_ccall_GC_", ITccall (False, False, PlaySafe False),
334 ("_casm_", ITccall (False, True, PlayRisky),
336 ("_casm_GC_", ITccall (False, True, PlaySafe False),
340 haskellKeySymsFM = listToUFM $
341 map (\ (x,y,z) -> (mkFastString x,(y,z)))
342 [ ("..", ITdotdot, Nothing)
343 ,(":", ITcolon, Nothing) -- (:) is a reserved op,
344 -- meaning only list cons
345 ,("::", ITdcolon, Nothing)
346 ,("=", ITequal, Nothing)
347 ,("\\", ITlam, Nothing)
348 ,("|", ITvbar, Nothing)
349 ,("<-", ITlarrow, Nothing)
350 ,("->", ITrarrow, Nothing)
351 ,("@", ITat, Nothing)
352 ,("~", ITtilde, Nothing)
353 ,("=>", ITdarrow, Nothing)
354 ,("-", ITminus, Nothing)
355 ,("!", ITbang, Nothing)
357 ,("*", ITstar, Just (bit glaExtsBit)) -- For data T (a::*) = MkT
358 ,(".", ITdot, Just (bit glaExtsBit)) -- For 'forall a . t'
360 ,("-<", ITlarrowtail, Just (bit arrowsBit))
361 ,(">-", ITrarrowtail, Just (bit arrowsBit))
362 ,("-<<", ITLarrowtail, Just (bit arrowsBit))
363 ,(">>-", ITRarrowtail, Just (bit arrowsBit))
368 -----------------------------------------------------------------------------
373 - (exts) lexing a source with extensions, eg, an interface file or
375 - (bol) pointer to beginning of line (for column calculations)
376 - (buf) pointer to beginning of token
377 - (buf) pointer to current char
378 - (atbol) flag indicating whether we're at the beginning of a line
381 lexer :: (Token -> P a) -> P a
382 lexer cont buf s@(PState{
390 -- first, start a new lexeme and lose all the whitespace
392 tab line bol atbol (stepOverLexeme buf)
394 line = srcLocLine loc
396 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
397 case currentChar# buf of
400 if bufferExhausted (stepOn buf)
401 then cont ITeof buf s'
402 else trace "lexer: misplaced NUL?" $
403 tab y bol atbol (stepOn buf)
405 '\n'# -> let buf' = stepOn buf
406 in tab (y +# 1#) (currentIndex# buf') 1# buf'
408 -- find comments. This got harder in Haskell 98.
409 '-'# -> let trundle n =
410 let next = lookAhead# buf n in
411 if next `eqChar#` '-'# then trundle (n +# 1#)
412 else if is_symbol next || n <# 2#
415 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
418 -- comments and pragmas. We deal with LINE pragmas here,
419 -- and throw out any unrecognised pragmas as comments. Any
420 -- pragmas we know about are dealt with later (after any layout
421 -- processing if necessary).
422 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
423 if lookAhead# buf 2# `eqChar#` '#'# then
424 case expandWhile# is_space (addToCurrentPos buf 3#) of { buf1->
425 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
426 let lexeme = mkFastString -- ToDo: too slow
427 (map toUpper (lexemeToString buf2)) in
428 case lookupUFM pragmaKeywordsFM lexeme of
429 -- ignore RULES pragmas when -fglasgow-exts is off
430 Just ITrules_prag | not (glaExtsEnabled exts) ->
431 skip_to_end (stepOnBy# buf 2#) s'
433 line_prag skip_to_end buf2 s'
434 Just other -> is_a_token
435 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
438 else skip_to_end (stepOnBy# buf 2#) s'
440 skip_to_end = skipNestedComment (lexer cont)
442 -- special GHC extension: we grok cpp-style #line pragmas
443 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
444 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
445 lookAhead# buf 2# `eqChar#` 'i'# &&
446 lookAhead# buf 3# `eqChar#` 'n'# &&
447 lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5#
448 | otherwise = stepOn buf
450 case expandWhile# is_space buf1 of { buf2 ->
451 if is_digit (currentChar# buf2)
452 then line_prag next_line buf2 s'
456 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
458 -- tabs have been expanded beforehand
459 c | is_space c -> tab y bol atbol (stepOn buf)
460 | otherwise -> is_a_token
462 where s' = s{loc = replaceSrcLine loc y,
466 is_a_token | atbol /=# 0# = lexBOL cont buf s'
467 | otherwise = lexToken cont exts buf s'
469 -- {-# LINE .. #-} pragmas. yeuch.
470 line_prag cont buf s@PState{loc=loc} =
471 case expandWhile# is_space buf of { buf1 ->
472 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
473 -- subtract one: the line number refers to the *following* line.
474 let real_line = line - 1 in
475 case fromInteger real_line of { i@(I# l) ->
476 -- ToDo, if no filename then we skip the newline.... d'oh
477 case expandWhile# is_space buf2 of { buf3 ->
478 case currentChar# buf3 of
480 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
482 file = lexemeToFastString buf4
483 new_buf = stepOn (stepOverLexeme buf4)
485 if nullFastString file
486 then cont new_buf s{loc = replaceSrcLine loc l}
487 else cont new_buf s{loc = mkSrcLoc file i}
489 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
492 skipNestedComment :: P a -> P a
493 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
495 skipNestedComment' :: SrcLoc -> P a -> P a
496 skipNestedComment' orig_loc cont buf = loop buf
499 case currentChar# buf of
500 '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
502 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
504 (skipNestedComment' orig_loc cont)
507 '\n'# -> \ s@PState{loc=loc} ->
508 let buf' = stepOn buf in
509 loop buf' s{loc = incSrcLine loc,
510 bol = currentIndex# buf',
513 -- pass the original SrcLoc to lexError so that the error is
514 -- reported at the line it was originally on, not the line at
515 -- the end of the file.
516 '\NUL'# | bufferExhausted (stepOn buf) ->
517 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
519 _ -> loop (stepOn buf)
521 -- When we are lexing the first token of a line, check whether we need to
522 -- insert virtual semicolons or close braces due to layout.
524 lexBOL :: (Token -> P a) -> P a
525 lexBOL cont buf s@(PState{
532 if need_close_curly then
533 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
534 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
535 else if need_semi_colon then
536 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
537 cont ITsemi buf s{atbol = 0#}
539 lexToken cont exts buf s{atbol = 0#}
541 col = currentIndex# buf -# bol
554 Layout n -> col ==# n
557 lexToken :: (Token -> P a) -> Int# -> P a
558 lexToken cont exts buf =
559 -- trace "lexToken" $
560 case currentChar# buf of
562 -- special symbols ----------------------------------------------------
563 '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'# &&
564 -- Unboxed tules: '(#' but not '(##'
565 not (lookAhead# buf 2# `eqChar#` '#'#)
566 -> cont IToubxparen (addToCurrentPos buf 2#)
567 -- Arrow notation extension: '(|' but not '(||'
568 | arrowsEnabled exts && lookAhead# buf 1# `eqChar#` '|'# &&
569 not (lookAhead# buf 2# `eqChar#` '|'#)
570 -> cont IToparenbar (addToCurrentPos buf 2#)
572 -> cont IToparen (incCurrentPos buf)
574 ')'# -> cont ITcparen (incCurrentPos buf)
575 '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
576 cont ITopabrack (addToCurrentPos buf 2#)
577 ------- MetaHaskell Extensions, looking for [| [e| [t| [p| and [d|
578 | glaExtsEnabled exts &&
579 ((lookAhead# buf 1# ) `eqChar#` '|'# ) ->
580 cont ITopenExpQuote (addToCurrentPos buf 2# )
581 | glaExtsEnabled exts &&
582 (let c = (lookAhead# buf 1# )
583 in eqChar# c 'e'# || eqChar# c 't'# || eqChar# c 'd'# || eqChar# c 'p'#) &&
584 ((lookAhead# buf 2#) `eqChar#` '|'#) ->
585 let quote 'e'# = ITopenExpQuote
586 quote 'p'# = ITopenPatQuote
587 quote 'd'# = ITopenDecQuote
588 quote 't'# = ITopenTypQuote
589 in cont (quote (lookAhead# buf 1#)) (addToCurrentPos buf 3# )
591 cont ITobrack (incCurrentPos buf)
593 ']'# -> cont ITcbrack (incCurrentPos buf)
594 ','# -> cont ITcomma (incCurrentPos buf)
595 ';'# -> cont ITsemi (incCurrentPos buf)
596 '}'# -> \ s@PState{context = ctx} ->
598 (_:ctx') -> cont ITccurly (incCurrentPos buf) s{context=ctx'}
599 _ -> lexError "too many '}'s" buf s
600 '|'# -> case lookAhead# buf 1# of
601 '}'# | glaExtsEnabled exts -> cont ITccurlybar
602 (addToCurrentPos buf 2#)
603 -- MetaHaskell extension
604 ']'# | glaExtsEnabled exts -> cont ITcloseQuote (addToCurrentPos buf 2#)
605 -- arrow notation extension
606 ')'# | arrowsEnabled exts -> cont ITcparenbar
607 (addToCurrentPos buf 2#)
608 other -> lex_sym cont exts (incCurrentPos buf)
609 ':'# -> case lookAhead# buf 1# of
610 ']'# | parrEnabled exts -> cont ITcpabrack
611 (addToCurrentPos buf 2#)
612 _ -> lex_sym cont exts (incCurrentPos buf)
615 '#'# -> case lookAhead# buf 1# of
616 ')'# | glaExtsEnabled exts
617 -> cont ITcubxparen (addToCurrentPos buf 2#)
618 '-'# -> case lookAhead# buf 2# of
619 '}'# -> cont ITclose_prag (addToCurrentPos buf 3#)
620 _ -> lex_sym cont exts (incCurrentPos buf)
621 _ -> lex_sym cont exts (incCurrentPos buf)
623 '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
624 -> lex_cstring cont (addToCurrentPos buf 2#)
626 -> cont ITbackquote (incCurrentPos buf)
628 '{'# -> -- for Emacs: -}
629 case lookAhead# buf 1# of
630 '|'# | glaExtsEnabled exts
631 -> cont ITocurlybar (addToCurrentPos buf 2#)
632 '-'# -> case lookAhead# buf 2# of
633 '#'# -> lex_prag cont (addToCurrentPos buf 3#)
634 _ -> cont ITocurly (incCurrentPos buf)
635 _ -> (layoutOff `thenP_` cont ITocurly) (incCurrentPos buf)
640 -- strings/characters -------------------------------------------------
641 '\"'#{-"-} -> lex_string cont exts [] (incCurrentPos buf)
642 '\''# -> lex_char (char_end cont) exts (incCurrentPos buf)
644 -- Hexadecimal and octal constants
645 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
646 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
647 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
648 -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec
649 where ch = lookAhead# buf 1#
650 ch2 = lookAhead# buf 2#
651 buf' = addToCurrentPos buf 2#
654 if bufferExhausted (stepOn buf) then
657 trace "lexIface: misplaced NUL?" $
658 cont (ITunknown "\NUL") (stepOn buf)
660 '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) -> -- ?x implicit parameter
661 specialPrefixId ITdupipvarid cont exts (incCurrentPos buf)
662 '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
663 specialPrefixId ITsplitipvarid cont exts (incCurrentPos buf)
665 ---------------- MetaHaskell Extensions for quotation escape
666 '$'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) -> -- $x variable escape
667 specialPrefixId ITidEscape cont exts (addToCurrentPos buf 1#)
668 '$'# | glaExtsEnabled exts && -- $( f x ) expression escape
669 ((lookAhead# buf 1#) `eqChar#` '('#) -> cont ITparenEscape (addToCurrentPos buf 2#)
671 c | is_digit c -> lex_num cont exts 0 buf
672 | is_symbol c -> lex_sym cont exts buf
673 | is_upper c -> lex_con cont exts buf
674 | is_lower c -> lex_id cont exts buf
675 | otherwise -> lexError "illegal character" buf
677 -- Int# is unlifted, and therefore faster than Bool for flags.
683 -------------------------------------------------------------------------------
687 = case expandWhile# is_space buf of { buf1 ->
688 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
689 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
690 case lookupUFM pragmaKeywordsFM lexeme of
691 Just kw -> cont kw (mergeLexemes buf buf2)
692 Nothing -> panic "lex_prag"
695 -------------------------------------------------------------------------------
698 lex_string cont exts s buf
699 = case currentChar# buf of
701 let buf' = incCurrentPos buf
702 in case currentChar# buf' of
703 '#'# | glaExtsEnabled exts ->
705 then lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
706 else let s' = mkFastStringNarrow (map chr (reverse s)) in
707 -- always a narrow string/byte array
708 cont (ITprimstring s') (incCurrentPos buf')
710 _other -> let s' = mkFastString (map chr (reverse s))
711 in cont (ITstring s') buf'
713 -- ignore \& in a string, deal with string gaps
714 '\\'# | next_ch `eqChar#` '&'#
715 -> lex_string cont exts s buf'
717 -> lex_stringgap cont exts s (incCurrentPos buf)
719 where next_ch = lookAhead# buf 1#
720 buf' = addToCurrentPos buf 2#
722 _ -> lex_char (lex_next_string cont s) exts buf
724 lex_stringgap cont exts s buf
725 = let buf' = incCurrentPos buf in
726 case currentChar# buf of
727 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
728 st{loc = incSrcLine loc}
729 '\\'# -> lex_string cont exts s buf'
730 c | is_space c -> lex_stringgap cont exts s buf'
731 other -> charError buf'
733 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
735 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
736 lex_char cont exts buf
737 = case currentChar# buf of
738 '\\'# -> lex_escape (cont exts) (incCurrentPos buf)
739 c | is_any c -> cont exts (I# (ord# c)) (incCurrentPos buf)
740 other -> charError buf
742 char_end cont exts c buf
743 = case currentChar# buf of
744 '\''# -> let buf' = incCurrentPos buf in
745 case currentChar# buf' of
746 '#'# | glaExtsEnabled exts
747 -> cont (ITprimchar c) (incCurrentPos buf')
748 _ -> cont (ITchar c) buf'
752 = let buf' = incCurrentPos buf in
753 case currentChar# buf of
754 'a'# -> cont (ord '\a') buf'
755 'b'# -> cont (ord '\b') buf'
756 'f'# -> cont (ord '\f') buf'
757 'n'# -> cont (ord '\n') buf'
758 'r'# -> cont (ord '\r') buf'
759 't'# -> cont (ord '\t') buf'
760 'v'# -> cont (ord '\v') buf'
761 '\\'# -> cont (ord '\\') buf'
762 '"'# -> cont (ord '\"') buf'
763 '\''# -> cont (ord '\'') buf'
764 '^'# -> let c = currentChar# buf' in
765 if c `geChar#` '@'# && c `leChar#` '_'#
766 then cont (I# (ord# c -# ord# '@'#)) (incCurrentPos buf')
769 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
770 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
772 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
774 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
775 Just buf2 <- [prefixMatch buf p] ] of
776 (c,buf2):_ -> cont (ord c) buf2
779 after_charnum cont i buf
780 = if i >= 0 && i <= 0x10FFFF
781 then cont (fromInteger i) buf
784 readNum cont buf is_digit base conv = read buf 0
786 = case currentChar# buf of { c ->
788 then read (incCurrentPos buf) (i*base + (toInteger (I# (conv c))))
794 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
795 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
797 hex c | is_digit c = ord# c -# ord# '0'#
798 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
799 oct_or_dec c = ord# c -# ord# '0'#
801 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
804 | c `geChar#` 'A'# && c `leChar#` 'Z'#
805 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
808 charError buf = lexError "error in character literal" buf
810 silly_escape_chars = [
847 -----------------------------------------------------------------------------
850 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
851 lex_num cont exts acc buf =
852 case scanNumLit acc buf of
854 case currentChar# buf' of
855 '.'# | is_digit (lookAhead# buf' 1#) ->
856 -- this case is not optimised at all, as the
857 -- presence of floating point numbers in interface
858 -- files is not that common. (ToDo)
859 case expandWhile# is_digit (incCurrentPos buf') of
860 buf2 -> -- points to first non digit char
861 case currentChar# buf2 of
862 'E'# -> float_exponent cont exts buf2
863 'e'# -> float_exponent cont exts buf2
864 _ -> float_done cont exts buf2
866 -- numbers like '9e4' are floats
867 'E'# -> float_exponent cont exts buf'
868 'e'# -> float_exponent cont exts buf'
869 _ -> after_lexnum cont exts acc' buf' -- it's an integer
871 float_exponent cont exts buf2 =
872 let buf3 = incCurrentPos buf2
873 buf4 = case currentChar# buf3 of
874 '-'# | is_digit (lookAhead# buf3 1#)
875 -> expandWhile# is_digit (incCurrentPos buf3)
876 '+'# | is_digit (lookAhead# buf3 1#)
877 -> expandWhile# is_digit (incCurrentPos buf3)
878 x | is_digit x -> expandWhile# is_digit buf3
881 float_done cont exts buf4
883 float_done cont exts buf =
884 case currentChar# buf of -- glasgow exts only
885 '#'# | glaExtsEnabled exts ->
886 let buf' = incCurrentPos buf in
887 case currentChar# buf' of
888 '#'# -> cont (ITprimdouble v) (incCurrentPos buf')
889 _ -> cont (ITprimfloat v) buf'
890 _ -> cont (ITrational v) buf
892 v = readRational__ (lexemeToString buf)
894 after_lexnum cont exts i buf
895 = case currentChar# buf of
896 '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incCurrentPos buf)
897 _ -> cont (ITinteger i) buf
899 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
903 return ((n%1)*10^^(k-d), t)
906 (ds,s) <- lexDecDigits r
907 (ds',t) <- lexDotDigits s
908 return (read (ds++ds'), length ds', t)
910 readExp (e:s) | e `elem` "eE" = readExp' s
911 readExp s = return (0,s)
913 readExp' ('+':s) = readDec s
914 readExp' ('-':s) = do
917 readExp' s = readDec s
920 (ds,r) <- nonnull isDigit s
921 return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
924 lexDecDigits = nonnull isDigit
926 lexDotDigits ('.':s) = return (span isDigit s)
927 lexDotDigits s = return ("",s)
929 nonnull p s = do (cs@(_:_),t) <- return (span p s)
932 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
935 '-' : xs -> - (read_me xs)
939 = case (do { (x,"") <- readRational s ; return x }) of
941 [] -> error ("readRational__: no parse:" ++ top_s)
942 _ -> error ("readRational__: ambiguous parse:" ++ top_s)
944 -----------------------------------------------------------------------------
945 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
947 -- we lexemeToFastString on the bit between the ``''s, but include the
948 -- quotes in the full lexeme.
950 lex_cstring cont buf =
951 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
952 Just buf' -> cont (ITlitlit (lexemeToFastString
953 (addToCurrentPos buf' (negateInt# 2#))))
954 (mergeLexemes buf buf')
955 Nothing -> lexError "unterminated ``" buf
957 -----------------------------------------------------------------------------
958 -- identifiers, symbols etc.
960 -- used for identifiers with special prefixes like
961 -- ?x (implicit parameters), $x (MetaHaskell escapes) and #x
962 -- we've already seen the prefix char, so look for an id, and wrap
963 -- the new "ip_constr" around the lexeme returned
965 specialPrefixId ip_constr cont exts buf = lex_id newcont exts buf
966 where newcont (ITvarid lexeme) buf2 = cont (ip_constr (tailFS lexeme)) buf2
967 newcont token buf2 = cont token buf2
969 case expandWhile# is_ident buf of
970 buf' -> cont (ip_constr (tailFS lexeme)) buf'
971 where lexeme = lexemeToFastString buf'
974 lex_id cont exts buf =
975 let buf1 = expandWhile# is_ident buf in
978 case (if glaExtsEnabled exts
979 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
980 else buf1) of { buf' ->
983 let lexeme = lexemeToFastString buf' in
985 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
986 Just kwd_token -> --trace ("hkeywd: "++unpackFS(lexeme)) $
990 let var_token = cont (ITvarid lexeme) buf' in
992 case lookupUFM ghcExtensionKeywordsFM lexeme of {
993 Just (kwd_token, validExts)
994 | validExts .&. (toInt32 exts) /= 0 -> cont kwd_token buf';
999 lex_sym cont exts buf =
1000 -- trace "lex_sym" $
1001 case expandWhile# is_symbol buf of
1002 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
1003 Just (kwd_token, Nothing)
1004 -> cont kwd_token buf' ;
1005 Just (kwd_token, Just validExts)
1006 | validExts .&. toInt32 exts /= 0
1007 -> cont kwd_token buf' ;
1008 other -> cont (mk_var_token lexeme) buf'
1010 where lexeme = lexemeToFastString buf'
1013 -- lex_con recursively collects components of a qualified identifer.
1014 -- The argument buf is the StringBuffer representing the lexeme
1015 -- identified so far, where the next character is upper-case.
1017 lex_con cont exts buf =
1018 -- trace ("con: "{-++unpackFS lexeme-}) $
1019 let empty_buf = stepOverLexeme buf in
1020 case expandWhile# is_ident empty_buf of { buf1 ->
1021 case slurp_trailing_hashes buf1 exts of { con_buf ->
1023 let all_buf = mergeLexemes buf con_buf
1025 con_lexeme = lexemeToFastString con_buf
1026 mod_lexeme = lexemeToFastString (decCurrentPos buf)
1027 all_lexeme = lexemeToFastString all_buf
1030 | emptyLexeme buf = cont (ITconid con_lexeme) all_buf
1031 | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1034 case currentChar# all_buf of
1035 '.'# -> maybe_qualified cont exts all_lexeme
1036 (incCurrentPos all_buf) just_a_conid
1041 maybe_qualified cont exts mod buf just_a_conid =
1042 -- trace ("qid: "{-++unpackFS lexeme-}) $
1043 case currentChar# buf of
1044 '['# -> -- Special case for []
1045 case lookAhead# buf 1# of
1046 ']'# -> cont (ITqconid (mod,FSLIT("[]"))) (addToCurrentPos buf 2#)
1049 '('# -> -- Special case for (,,,)
1050 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1051 case lookAhead# buf 1# of
1052 '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1053 ','# -> lex_ubx_tuple cont mod (addToCurrentPos buf 3#)
1056 ')'# -> cont (ITqconid (mod,FSLIT("()"))) (addToCurrentPos buf 2#)
1057 ','# -> lex_tuple cont mod (addToCurrentPos buf 2#) just_a_conid
1060 '-'# -> case lookAhead# buf 1# of
1061 '>'# -> cont (ITqconid (mod,FSLIT("(->)"))) (addToCurrentPos buf 2#)
1062 _ -> lex_id3 cont exts mod buf just_a_conid
1064 _ -> lex_id3 cont exts mod buf just_a_conid
1067 lex_id3 cont exts mod buf just_a_conid
1068 | is_upper (currentChar# buf) =
1069 lex_con cont exts buf
1071 | is_symbol (currentChar# buf) =
1073 start_new_lexeme = stepOverLexeme buf
1075 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1076 case expandWhile# is_symbol start_new_lexeme of { buf' ->
1078 lexeme = lexemeToFastString buf'
1079 -- real lexeme is M.<sym>
1080 new_buf = mergeLexemes buf buf'
1082 cont (mk_qvar_token mod lexeme) new_buf
1083 -- wrong, but arguably morally right: M... is now a qvarsym
1088 start_new_lexeme = stepOverLexeme buf
1090 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1091 case expandWhile# is_ident start_new_lexeme of { buf1 ->
1096 case slurp_trailing_hashes buf1 exts of { buf' ->
1099 lexeme = lexemeToFastString buf'
1100 new_buf = mergeLexemes buf buf'
1101 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1103 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1104 Nothing -> is_a_qvarid ;
1106 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
1107 -> is_a_qvarid -- recognised as keywords here.
1109 -> just_a_conid -- avoid M.where etc.
1112 slurp_trailing_hashes buf exts
1113 | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1118 | is_upper f = ITconid pk_str
1119 | is_ident f = ITvarid pk_str
1120 | f `eqChar#` ':'# = ITconsym pk_str
1121 | otherwise = ITvarsym pk_str
1123 (C# f) = headFS pk_str
1124 -- tl = _TAIL_ pk_str
1126 mk_qvar_token m token =
1127 -- trace ("mk_qvar ") $
1128 case mk_var_token token of
1129 ITconid n -> ITqconid (m,n)
1130 ITvarid n -> ITqvarid (m,n)
1131 ITconsym n -> ITqconsym (m,n)
1132 ITvarsym n -> ITqvarsym (m,n)
1133 _ -> ITunknown (show token)
1136 ----------------------------------------------------------------------------
1137 Horrible stuff for dealing with M.(,,,)
1140 lex_tuple cont mod buf back_off =
1144 case currentChar# buf of
1145 ','# -> go (n+1) (stepOn buf)
1146 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1149 lex_ubx_tuple cont mod buf back_off =
1153 case currentChar# buf of
1154 ','# -> go (n+1) (stepOn buf)
1155 '#'# -> case lookAhead# buf 1# of
1156 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1162 -----------------------------------------------------------------------------
1173 data PState = PState {
1175 extsBitmap :: Int#, -- bitmap that determines permitted extensions
1178 context :: [LayoutContext]
1181 type P a = StringBuffer -- Input string
1186 returnP a buf s = POk s a
1188 thenP :: P a -> (a -> P b) -> P b
1189 m `thenP` k = \ buf s ->
1191 POk s1 a -> k a buf s1
1192 PFailed err -> PFailed err
1194 thenP_ :: P a -> P b -> P b
1195 m `thenP_` k = m `thenP` \_ -> k
1197 mapP :: (a -> P b) -> [a] -> P [b]
1198 mapP f [] = returnP []
1201 mapP f as `thenP` \bs ->
1204 failP :: String -> P a
1205 failP msg buf s = PFailed (text msg)
1207 failMsgP :: Message -> P a
1208 failMsgP msg buf s = PFailed msg
1210 lexError :: String -> P a
1211 lexError str buf s@PState{ loc = loc }
1212 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1214 getSrcLocP :: P SrcLoc
1215 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1217 -- use a temporary SrcLoc for the duration of the argument
1218 setSrcLocP :: SrcLoc -> P a -> P a
1219 setSrcLocP new_loc p buf s =
1220 case p buf s{ loc=new_loc } of
1222 PFailed e -> PFailed e
1224 getSrcFile :: P FastString
1225 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1227 pushContext :: LayoutContext -> P ()
1228 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1232 This special case in layoutOn is to handle layout contexts with are
1233 indented the same or less than the current context. This is illegal
1234 according to the Haskell spec, so we have to arrange to close the
1235 current context. eg.
1240 after the first 'where', the sequence of events is:
1242 - layout system inserts a ';' (column 0)
1243 - parser begins a new context at column 0
1244 - parser shifts ';' (legal empty declaration)
1245 - parser sees 'class': parse error (we're still in the inner context)
1247 trouble is, by the time we know we need a new context, the lexer has
1248 already generated the ';'. Hacky solution is as follows: since we
1249 know the column of the next token (it's the column number of the new
1250 context), we set the ACTUAL column number of the new context to this
1251 numer plus one. Hence the next time the lexer is called, a '}' will
1252 be generated to close the new context straight away. Furthermore, we
1253 have to set the atbol flag so that the ';' that the parser shifted as
1254 part of the new context is re-generated.
1256 when the new context is *less* indented than the current one:
1258 f = f where g = g where
1261 - current context: column 12.
1262 - on seeing 'h' (column 0), the layout system inserts '}'
1263 - parser starts a new context, column 0
1264 - parser sees '}', uses it to close new context
1265 - we still need to insert another '}' followed by a ';',
1266 hence the atbol trick.
1268 There's also a special hack in here to deal with
1275 i.e. the inner context is at the same indentation level as the outer
1276 context. This is strictly illegal according to Haskell 98, but
1277 there's a lot of existing code using this style and it doesn't make
1278 any sense to disallow it, since empty 'do' lists don't make sense.
1281 layoutOn :: Bool -> P ()
1282 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1283 let offset = lexemeIndex buf -# bol in
1286 | if strict then prev_off >=# offset else prev_off ># offset ->
1287 --trace ("layout on, column: " ++ show (I# offset)) $
1288 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1290 --trace ("layout on, column: " ++ show (I# offset)) $
1291 POk s{ context = Layout offset : ctx } ()
1294 layoutOff buf s@(PState{ context = ctx }) =
1295 POk s{ context = NoLayout:ctx } ()
1298 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1300 (_:tl) -> POk s{ context = tl } ()
1301 [] -> PFailed (srcParseErr buf loc)
1303 -- for reasons of efficiency, flags indicating language extensions (eg,
1304 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1307 glaExtsBit, ffiBit, parrBit :: Int
1314 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1315 glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit
1316 ffiEnabled flags = testBit (toInt32 flags) ffiBit
1317 withEnabled flags = testBit (toInt32 flags) withBit
1318 parrEnabled flags = testBit (toInt32 flags) parrBit
1319 arrowsEnabled flags = testBit (toInt32 flags) arrowsBit
1321 toInt32 :: Int# -> Int32
1322 toInt32 x# = fromIntegral (I# x#)
1324 -- convenient record-based bitmap for the interface to the rest of the world
1326 -- NB: `glasgowExtsEF' implies `ffiEF' (see `mkPState' below)
1328 data ExtFlags = ExtFlags {
1329 glasgowExtsEF :: Bool,
1336 -- create a parse state
1338 mkPState :: SrcLoc -> ExtFlags -> PState
1342 extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits},
1348 bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts
1349 .|. ffiBit `setBitIf` (ffiEF exts
1350 || glasgowExtsEF exts)
1351 .|. withBit `setBitIf` withEF exts
1352 .|. parrBit `setBitIf` parrEF exts
1353 .|. arrowsBit `setBitIf` arrowsEF exts
1355 setBitIf :: Int -> Bool -> Int32
1356 b `setBitIf` cond | cond = bit b
1359 -----------------------------------------------------------------------------
1361 srcParseErr :: StringBuffer -> SrcLoc -> Message
1365 then ptext SLIT(": parse error (possibly incorrect indentation)")
1366 else hcat [ptext SLIT(": parse error on input "),
1367 char '`', text token, char '\'']
1370 token = lexemeToString s