2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
24 Token(..), lexer, ParseResult(..), PState(..),
25 ExtFlags(..), mkPState,
28 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
29 getSrcLocP, setSrcLocP, getSrcFile,
30 layoutOn, layoutOff, pushContext, popContext
33 #include "HsVersions.h"
35 import Char ( toUpper, isDigit, chr, ord )
38 import PrelNames ( mkTupNameStr )
39 import ForeignCall ( Safety(..) )
40 import UniqFM ( listToUFM, lookupUFM )
41 import BasicTypes ( Boxity(..) )
42 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
43 replaceSrcLine, mkSrcLoc )
45 import ErrUtils ( Message )
53 import Bits ( Bits(..) ) -- non-std
57 %************************************************************************
59 \subsection{Data types}
61 %************************************************************************
63 The token data type, fairly un-interesting except from one
64 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
65 strictness, unfolding etc).
67 The Idea/Observation here is that the renamer needs to scan through
68 all of an interface file before it can continue. But only a fraction
69 of the information contained in the file turns out to be useful, so
70 delaying as much as possible of the scanning and parsing of an
71 interface file Makes Sense (Heap profiles of the compiler
72 show a reduction in heap usage by at least a factor of two,
75 Hence, the interface file lexer spots when value declarations are
76 being scanned and return the @ITidinfo@ and @ITtype@ constructors
77 for the type and any other id info for that binding (unfolding, strictness
78 etc). These constructors are applied to the result of lexing these sub-chunks.
80 The lexing of the type and id info is all done lazily, of course, so
81 the scanning (and subsequent parsing) will be done *only* on the ids the
82 renamer finds out that it is interested in. The rest will just be junked.
83 Laziness, you know it makes sense :-)
87 = ITas -- Haskell keywords
111 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
113 | ITforall -- GHC extension keywords
125 | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc)
127 | ITspecialise_prag -- Pragmas
137 | ITdotdot -- reserved symbols
152 | ITbiglam -- GHC-extension symbols
154 | ITocurly -- special symbols
156 | ITocurlybar -- {|, for type applications
157 | ITccurlybar -- |}, for type applications
160 | ITopabrack -- [:, for parallel arrays with -fparr
161 | ITcpabrack -- :], for parallel arrays with -fparr
172 | ITvarid FastString -- identifiers
174 | ITvarsym FastString
175 | ITconsym FastString
176 | ITqvarid (FastString,FastString)
177 | ITqconid (FastString,FastString)
178 | ITqvarsym (FastString,FastString)
179 | ITqconsym (FastString,FastString)
181 | ITdupipvarid FastString -- GHC extension: implicit param: ?x
182 | ITsplitipvarid FastString -- GHC extension: implicit param: %x
184 | ITpragma StringBuffer
187 | ITstring FastString
189 | ITrational Rational
192 | ITprimstring FastString
194 | ITprimfloat Rational
195 | ITprimdouble Rational
196 | ITlitlit FastString
198 | ITunknown String -- Used when the lexer can't make sense of it
199 | ITeof -- end of file token
200 deriving Show -- debugging
203 -----------------------------------------------------------------------------
207 pragmaKeywordsFM = listToUFM $
208 map (\ (x,y) -> (mkFastString x,y))
209 [( "SPECIALISE", ITspecialise_prag ),
210 ( "SPECIALIZE", ITspecialise_prag ),
211 ( "SOURCE", ITsource_prag ),
212 ( "INLINE", ITinline_prag ),
213 ( "NOINLINE", ITnoinline_prag ),
214 ( "NOTINLINE", ITnoinline_prag ),
215 ( "LINE", ITline_prag ),
216 ( "RULES", ITrules_prag ),
217 ( "RULEZ", ITrules_prag ), -- american spelling :-)
218 ( "SCC", ITscc_prag ),
219 ( "DEPRECATED", ITdeprecated_prag )
222 haskellKeywordsFM = listToUFM $
223 map (\ (x,y) -> (mkFastString x,y))
224 [( "_", ITunderscore ),
227 ( "class", ITclass ),
229 ( "default", ITdefault ),
230 ( "deriving", ITderiving ),
233 ( "hiding", IThiding ),
235 ( "import", ITimport ),
237 ( "infix", ITinfix ),
238 ( "infixl", ITinfixl ),
239 ( "infixr", ITinfixr ),
240 ( "instance", ITinstance ),
242 ( "module", ITmodule ),
243 ( "newtype", ITnewtype ),
245 ( "qualified", ITqualified ),
248 ( "where", ITwhere ),
249 ( "_scc_", ITscc ) -- ToDo: remove
252 isSpecial :: Token -> Bool
253 -- If we see M.x, where x is a keyword, but
254 -- is special, we treat is as just plain M.x,
256 isSpecial ITas = True
257 isSpecial IThiding = True
258 isSpecial ITqualified = True
259 isSpecial ITforall = True
260 isSpecial ITexport = True
261 isSpecial ITlabel = True
262 isSpecial ITdynamic = True
263 isSpecial ITsafe = True
264 isSpecial ITthreadsafe = True
265 isSpecial ITunsafe = True
266 isSpecial ITwith = True
267 isSpecial ITccallconv = True
268 isSpecial ITstdcallconv = True
271 -- the bitmap provided as the third component indicates whether the
272 -- corresponding extension keyword is valid under the extension options
273 -- provided to the compiler; if the extension corresponding to *any* of the
274 -- bits set in the bitmap is enabled, the keyword is valid (this setup
275 -- facilitates using a keyword in two different extensions that can be
276 -- activated independently)
278 ghcExtensionKeywordsFM = listToUFM $
279 map (\(x, y, z) -> (mkFastString x, (y, z)))
280 [ ( "forall", ITforall, bit glaExtsBit),
281 ( "foreign", ITforeign, bit ffiBit),
282 ( "export", ITexport, bit ffiBit),
283 ( "label", ITlabel, bit ffiBit),
284 ( "dynamic", ITdynamic, bit ffiBit),
285 ( "safe", ITsafe, bit ffiBit),
286 ( "threadsafe", ITthreadsafe, bit ffiBit),
287 ( "unsafe", ITunsafe, bit ffiBit),
288 ( "with", ITwith, bit withBit),
289 ( "stdcall", ITstdcallconv, bit ffiBit),
290 ( "ccall", ITccallconv, bit ffiBit),
291 ( "dotnet", ITdotnet, bit ffiBit),
292 ("_ccall_", ITccall (False, False, PlayRisky),
294 ("_ccall_GC_", ITccall (False, False, PlaySafe False),
296 ("_casm_", ITccall (False, True, PlayRisky),
298 ("_casm_GC_", ITccall (False, True, PlaySafe False),
302 haskellKeySymsFM = listToUFM $
303 map (\ (x,y) -> (mkFastString x,y))
317 ,(".", ITdot) -- sadly, for 'forall a . t'
321 -----------------------------------------------------------------------------
326 - (exts) lexing a source with extensions, eg, an interface file or
328 - (bol) pointer to beginning of line (for column calculations)
329 - (buf) pointer to beginning of token
330 - (buf) pointer to current char
331 - (atbol) flag indicating whether we're at the beginning of a line
334 lexer :: (Token -> P a) -> P a
335 lexer cont buf s@(PState{
343 -- first, start a new lexeme and lose all the whitespace
345 tab line bol atbol (stepOverLexeme buf)
347 line = srcLocLine loc
349 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
350 case currentChar# buf of
353 if bufferExhausted (stepOn buf)
354 then cont ITeof buf s'
355 else trace "lexer: misplaced NUL?" $
356 tab y bol atbol (stepOn buf)
358 '\n'# -> let buf' = stepOn buf
359 in tab (y +# 1#) (currentIndex# buf') 1# buf'
361 -- find comments. This got harder in Haskell 98.
362 '-'# -> let trundle n =
363 let next = lookAhead# buf n in
364 if next `eqChar#` '-'# then trundle (n +# 1#)
365 else if is_symbol next || n <# 2#
368 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
371 -- comments and pragmas. We deal with LINE pragmas here,
372 -- and throw out any unrecognised pragmas as comments. Any
373 -- pragmas we know about are dealt with later (after any layout
374 -- processing if necessary).
375 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
376 if lookAhead# buf 2# `eqChar#` '#'# then
377 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
378 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
379 let lexeme = mkFastString -- ToDo: too slow
380 (map toUpper (lexemeToString buf2)) in
381 case lookupUFM pragmaKeywordsFM lexeme of
382 -- ignore RULES pragmas when -fglasgow-exts is off
383 Just ITrules_prag | not (glaExtsEnabled exts) ->
384 skip_to_end (stepOnBy# buf 2#) s'
386 line_prag skip_to_end buf2 s'
387 Just other -> is_a_token
388 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
391 else skip_to_end (stepOnBy# buf 2#) s'
393 skip_to_end = skipNestedComment (lexer cont)
395 -- special GHC extension: we grok cpp-style #line pragmas
396 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
397 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
398 lookAhead# buf 2# `eqChar#` 'i'# &&
399 lookAhead# buf 3# `eqChar#` 'n'# &&
400 lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5#
401 | otherwise = stepOn buf
403 case expandWhile# is_space buf1 of { buf2 ->
404 if is_digit (currentChar# buf2)
405 then line_prag next_line buf2 s'
409 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
411 -- tabs have been expanded beforehand
412 c | is_space c -> tab y bol atbol (stepOn buf)
413 | otherwise -> is_a_token
415 where s' = s{loc = replaceSrcLine loc y,
419 is_a_token | atbol /=# 0# = lexBOL cont buf s'
420 | otherwise = lexToken cont exts buf s'
422 -- {-# LINE .. #-} pragmas. yeuch.
423 line_prag cont buf s@PState{loc=loc} =
424 case expandWhile# is_space buf of { buf1 ->
425 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
426 -- subtract one: the line number refers to the *following* line.
427 let real_line = line - 1 in
428 case fromInteger real_line of { i@(I# l) ->
429 -- ToDo, if no filename then we skip the newline.... d'oh
430 case expandWhile# is_space buf2 of { buf3 ->
431 case currentChar# buf3 of
433 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
435 file = lexemeToFastString buf4
436 new_buf = stepOn (stepOverLexeme buf4)
438 if nullFastString file
439 then cont new_buf s{loc = replaceSrcLine loc l}
440 else cont new_buf s{loc = mkSrcLoc file i}
442 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
445 skipNestedComment :: P a -> P a
446 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
448 skipNestedComment' :: SrcLoc -> P a -> P a
449 skipNestedComment' orig_loc cont buf = loop buf
452 case currentChar# buf of
453 '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
455 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
457 (skipNestedComment' orig_loc cont)
460 '\n'# -> \ s@PState{loc=loc} ->
461 let buf' = stepOn buf in
462 loop buf' s{loc = incSrcLine loc,
463 bol = currentIndex# buf',
466 -- pass the original SrcLoc to lexError so that the error is
467 -- reported at the line it was originally on, not the line at
468 -- the end of the file.
469 '\NUL'# | bufferExhausted (stepOn buf) ->
470 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
472 _ -> loop (stepOn buf)
474 -- When we are lexing the first token of a line, check whether we need to
475 -- insert virtual semicolons or close braces due to layout.
477 lexBOL :: (Token -> P a) -> P a
478 lexBOL cont buf s@(PState{
485 if need_close_curly then
486 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
487 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
488 else if need_semi_colon then
489 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
490 cont ITsemi buf s{atbol = 0#}
492 lexToken cont exts buf s{atbol = 0#}
494 col = currentIndex# buf -# bol
507 Layout n -> col ==# n
510 lexToken :: (Token -> P a) -> Int# -> P a
511 lexToken cont exts buf =
512 -- trace "lexToken" $
513 case currentChar# buf of
515 -- special symbols ----------------------------------------------------
516 '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
517 -> cont IToubxparen (setCurrentPos# buf 2#)
519 -> cont IToparen (incLexeme buf)
521 ')'# -> cont ITcparen (incLexeme buf)
522 '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
523 cont ITopabrack (setCurrentPos# buf 2#)
525 cont ITobrack (incLexeme buf)
526 ']'# -> cont ITcbrack (incLexeme buf)
527 ','# -> cont ITcomma (incLexeme buf)
528 ';'# -> cont ITsemi (incLexeme buf)
529 '}'# -> \ s@PState{context = ctx} ->
531 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
532 _ -> lexError "too many '}'s" buf s
533 '|'# -> case lookAhead# buf 1# of
534 '}'# | glaExtsEnabled exts -> cont ITccurlybar
535 (setCurrentPos# buf 2#)
536 _ -> lex_sym cont (incLexeme buf)
537 ':'# -> case lookAhead# buf 1# of
538 ']'# | parrEnabled exts -> cont ITcpabrack
539 (setCurrentPos# buf 2#)
540 _ -> lex_sym cont (incLexeme buf)
543 '#'# -> case lookAhead# buf 1# of
544 ')'# | glaExtsEnabled exts
545 -> cont ITcubxparen (setCurrentPos# buf 2#)
546 '-'# -> case lookAhead# buf 2# of
547 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
548 _ -> lex_sym cont (incLexeme buf)
549 _ -> lex_sym cont (incLexeme buf)
551 '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
552 -> lex_cstring cont (setCurrentPos# buf 2#)
554 -> cont ITbackquote (incLexeme buf)
556 '{'# -> -- for Emacs: -}
557 case lookAhead# buf 1# of
558 '|'# | glaExtsEnabled exts
559 -> cont ITocurlybar (setCurrentPos# buf 2#)
560 '-'# -> case lookAhead# buf 2# of
561 '#'# -> lex_prag cont (setCurrentPos# buf 3#)
562 _ -> cont ITocurly (incLexeme buf)
563 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
565 -- strings/characters -------------------------------------------------
566 '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
567 '\''# -> lex_char (char_end cont) exts (incLexeme buf)
569 -- Hexadecimal and octal constants
570 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
571 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
572 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
573 -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec
574 where ch = lookAhead# buf 1#
575 ch2 = lookAhead# buf 2#
576 buf' = setCurrentPos# buf 2#
579 if bufferExhausted (stepOn buf) then
582 trace "lexIface: misplaced NUL?" $
583 cont (ITunknown "\NUL") (stepOn buf)
585 '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
586 lex_ip ITdupipvarid cont (incLexeme buf)
587 '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
588 lex_ip ITsplitipvarid cont (incLexeme buf)
589 c | is_digit c -> lex_num cont exts 0 buf
590 | is_symbol c -> lex_sym cont buf
591 | is_upper c -> lex_con cont exts buf
592 | is_ident c -> lex_id cont exts buf
593 | otherwise -> lexError "illegal character" buf
595 -- Int# is unlifted, and therefore faster than Bool for flags.
601 -------------------------------------------------------------------------------
605 = case expandWhile# is_space buf of { buf1 ->
606 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
607 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
608 case lookupUFM pragmaKeywordsFM lexeme of
609 Just kw -> cont kw (mergeLexemes buf buf2)
610 Nothing -> panic "lex_prag"
613 -------------------------------------------------------------------------------
616 lex_string cont exts s buf
617 = case currentChar# buf of
619 let buf' = incLexeme buf
620 s' = mkFastStringNarrow (map chr (reverse s))
621 in case currentChar# buf' of
622 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
623 then cont (ITprimstring s') (incLexeme buf')
624 else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
625 _ -> cont (ITstring s') buf'
627 -- ignore \& in a string, deal with string gaps
628 '\\'# | next_ch `eqChar#` '&'#
629 -> lex_string cont exts s buf'
631 -> lex_stringgap cont exts s (incLexeme buf)
633 where next_ch = lookAhead# buf 1#
634 buf' = setCurrentPos# buf 2#
636 _ -> lex_char (lex_next_string cont s) exts buf
638 lex_stringgap cont exts s buf
639 = let buf' = incLexeme buf in
640 case currentChar# buf of
641 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
642 st{loc = incSrcLine loc}
643 '\\'# -> lex_string cont exts s buf'
644 c | is_space c -> lex_stringgap cont exts s buf'
645 other -> charError buf'
647 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
649 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
650 lex_char cont exts buf
651 = case currentChar# buf of
652 '\\'# -> lex_escape (cont exts) (incLexeme buf)
653 c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
654 other -> charError buf
656 char_end cont exts c buf
657 = case currentChar# buf of
658 '\''# -> let buf' = incLexeme buf in
659 case currentChar# buf' of
660 '#'# | glaExtsEnabled exts
661 -> cont (ITprimchar c) (incLexeme buf')
662 _ -> cont (ITchar c) buf'
666 = let buf' = incLexeme buf in
667 case currentChar# buf of
668 'a'# -> cont (ord '\a') buf'
669 'b'# -> cont (ord '\b') buf'
670 'f'# -> cont (ord '\f') buf'
671 'n'# -> cont (ord '\n') buf'
672 'r'# -> cont (ord '\r') buf'
673 't'# -> cont (ord '\t') buf'
674 'v'# -> cont (ord '\v') buf'
675 '\\'# -> cont (ord '\\') buf'
676 '"'# -> cont (ord '\"') buf'
677 '\''# -> cont (ord '\'') buf'
678 '^'# -> let c = currentChar# buf' in
679 if c `geChar#` '@'# && c `leChar#` '_'#
680 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
683 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
684 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
686 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
688 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
689 Just buf2 <- [prefixMatch buf p] ] of
690 (c,buf2):_ -> cont (ord c) buf2
693 after_charnum cont i buf
694 = if i >= 0 && i <= 0x10FFFF
695 then cont (fromInteger i) buf
698 readNum cont buf is_digit base conv = read buf 0
700 = case currentChar# buf of { c ->
702 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
708 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
709 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
711 hex c | is_digit c = ord# c -# ord# '0'#
712 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
713 oct_or_dec c = ord# c -# ord# '0'#
715 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
718 | c `geChar#` 'A'# && c `leChar#` 'Z'#
719 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
722 charError buf = lexError "error in character literal" buf
724 silly_escape_chars = [
761 -----------------------------------------------------------------------------
764 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
765 lex_num cont exts acc buf =
766 case scanNumLit acc buf of
768 case currentChar# buf' of
769 '.'# | is_digit (lookAhead# buf' 1#) ->
770 -- this case is not optimised at all, as the
771 -- presence of floating point numbers in interface
772 -- files is not that common. (ToDo)
773 case expandWhile# is_digit (incLexeme buf') of
774 buf2 -> -- points to first non digit char
776 let l = case currentChar# buf2 of
782 = let buf3 = incLexeme buf2 in
783 case currentChar# buf3 of
784 '-'# | is_digit (lookAhead# buf3 1#)
785 -> expandWhile# is_digit (incLexeme buf3)
786 '+'# | is_digit (lookAhead# buf3 1#)
787 -> expandWhile# is_digit (incLexeme buf3)
788 x | is_digit x -> expandWhile# is_digit buf3
791 v = readRational__ (lexemeToString l)
793 in case currentChar# l of -- glasgow exts only
794 '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
795 case currentChar# l' of
796 '#'# -> cont (ITprimdouble v) (incLexeme l')
797 _ -> cont (ITprimfloat v) l'
798 _ -> cont (ITrational v) l
800 _ -> after_lexnum cont exts acc' buf'
802 after_lexnum cont exts i buf
803 = case currentChar# buf of
804 '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
805 _ -> cont (ITinteger i) buf
807 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
811 return ((n%1)*10^^(k-d), t)
814 (ds,s) <- lexDecDigits r
815 (ds',t) <- lexDotDigits s
816 return (read (ds++ds'), length ds', t)
818 readExp (e:s) | e `elem` "eE" = readExp' s
819 readExp s = return (0,s)
821 readExp' ('+':s) = readDec s
822 readExp' ('-':s) = do
825 readExp' s = readDec s
828 (ds,r) <- nonnull isDigit s
829 return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
832 lexDecDigits = nonnull isDigit
834 lexDotDigits ('.':s) = return (span isDigit s)
835 lexDotDigits s = return ("",s)
837 nonnull p s = do (cs@(_:_),t) <- return (span p s)
840 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
843 '-' : xs -> - (read_me xs)
847 = case (do { (x,"") <- readRational s ; return x }) of
849 [] -> error ("readRational__: no parse:" ++ top_s)
850 _ -> error ("readRational__: ambiguous parse:" ++ top_s)
852 -----------------------------------------------------------------------------
853 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
855 -- we lexemeToFastString on the bit between the ``''s, but include the
856 -- quotes in the full lexeme.
858 lex_cstring cont buf =
859 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
860 Just buf' -> cont (ITlitlit (lexemeToFastString
861 (setCurrentPos# buf' (negateInt# 2#))))
862 (mergeLexemes buf buf')
863 Nothing -> lexError "unterminated ``" buf
865 -----------------------------------------------------------------------------
866 -- identifiers, symbols etc.
868 lex_ip ip_constr cont buf =
869 case expandWhile# is_ident buf of
870 buf' -> cont (ip_constr (tailFS lexeme)) buf'
871 where lexeme = lexemeToFastString buf'
873 lex_id cont exts buf =
874 let buf1 = expandWhile# is_ident buf in
877 case (if glaExtsEnabled exts
878 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
879 else buf1) of { buf' ->
882 let lexeme = lexemeToFastString buf' in
884 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
885 Just kwd_token -> --trace ("hkeywd: "++unpackFS(lexeme)) $
889 let var_token = cont (ITvarid lexeme) buf' in
891 case lookupUFM ghcExtensionKeywordsFM lexeme of {
892 Just (kwd_token, validExts)
893 | validExts .&. (I# exts) /= 0 -> cont kwd_token buf';
900 case expandWhile# is_symbol buf of
901 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
902 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
903 cont kwd_token buf' ;
904 Nothing -> --trace ("sym: "++unpackFS lexeme) $
905 cont (mk_var_token lexeme) buf'
907 where lexeme = lexemeToFastString buf'
910 -- lex_con recursively collects components of a qualified identifer.
911 -- The argument buf is the StringBuffer representing the lexeme
912 -- identified so far, where the next character is upper-case.
914 lex_con cont exts buf =
915 -- trace ("con: "{-++unpackFS lexeme-}) $
916 let empty_buf = stepOverLexeme buf in
917 case expandWhile# is_ident empty_buf of { buf1 ->
918 case slurp_trailing_hashes buf1 exts of { con_buf ->
920 let all_buf = mergeLexemes buf con_buf
922 con_lexeme = lexemeToFastString con_buf
923 mod_lexeme = lexemeToFastString (decLexeme buf)
924 all_lexeme = lexemeToFastString all_buf
927 | emptyLexeme buf = cont (ITconid con_lexeme) all_buf
928 | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
931 case currentChar# all_buf of
932 '.'# -> maybe_qualified cont exts all_lexeme
933 (incLexeme all_buf) just_a_conid
938 maybe_qualified cont exts mod buf just_a_conid =
939 -- trace ("qid: "{-++unpackFS lexeme-}) $
940 case currentChar# buf of
941 '['# -> -- Special case for []
942 case lookAhead# buf 1# of
943 ']'# -> cont (ITqconid (mod,FSLIT("[]"))) (setCurrentPos# buf 2#)
946 '('# -> -- Special case for (,,,)
947 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
948 case lookAhead# buf 1# of
949 '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
950 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
953 ')'# -> cont (ITqconid (mod,FSLIT("()"))) (setCurrentPos# buf 2#)
954 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
957 '-'# -> case lookAhead# buf 1# of
958 '>'# -> cont (ITqconid (mod,FSLIT("(->)"))) (setCurrentPos# buf 2#)
959 _ -> lex_id3 cont exts mod buf just_a_conid
961 _ -> lex_id3 cont exts mod buf just_a_conid
964 lex_id3 cont exts mod buf just_a_conid
965 | is_upper (currentChar# buf) =
966 lex_con cont exts buf
968 | is_symbol (currentChar# buf) =
970 start_new_lexeme = stepOverLexeme buf
972 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
973 case expandWhile# is_symbol start_new_lexeme of { buf' ->
975 lexeme = lexemeToFastString buf'
976 -- real lexeme is M.<sym>
977 new_buf = mergeLexemes buf buf'
979 cont (mk_qvar_token mod lexeme) new_buf
980 -- wrong, but arguably morally right: M... is now a qvarsym
985 start_new_lexeme = stepOverLexeme buf
987 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
988 case expandWhile# is_ident start_new_lexeme of { buf1 ->
993 case slurp_trailing_hashes buf1 exts of { buf' ->
996 lexeme = lexemeToFastString buf'
997 new_buf = mergeLexemes buf buf'
998 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1000 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1001 Nothing -> is_a_qvarid ;
1003 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
1004 -> is_a_qvarid -- recognised as keywords here.
1006 -> just_a_conid -- avoid M.where etc.
1009 slurp_trailing_hashes buf exts
1010 | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1015 | is_upper f = ITconid pk_str
1016 | is_ident f = ITvarid pk_str
1017 | f `eqChar#` ':'# = ITconsym pk_str
1018 | otherwise = ITvarsym pk_str
1020 (C# f) = headFS pk_str
1021 -- tl = _TAIL_ pk_str
1023 mk_qvar_token m token =
1024 -- trace ("mk_qvar ") $
1025 case mk_var_token token of
1026 ITconid n -> ITqconid (m,n)
1027 ITvarid n -> ITqvarid (m,n)
1028 ITconsym n -> ITqconsym (m,n)
1029 ITvarsym n -> ITqvarsym (m,n)
1030 _ -> ITunknown (show token)
1033 ----------------------------------------------------------------------------
1034 Horrible stuff for dealing with M.(,,,)
1037 lex_tuple cont mod buf back_off =
1041 case currentChar# buf of
1042 ','# -> go (n+1) (stepOn buf)
1043 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1046 lex_ubx_tuple cont mod buf back_off =
1050 case currentChar# buf of
1051 ','# -> go (n+1) (stepOn buf)
1052 '#'# -> case lookAhead# buf 1# of
1053 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1059 -----------------------------------------------------------------------------
1070 data PState = PState {
1072 extsBitmap :: Int#, -- bitmap that determines permitted extensions
1075 context :: [LayoutContext]
1078 type P a = StringBuffer -- Input string
1083 returnP a buf s = POk s a
1085 thenP :: P a -> (a -> P b) -> P b
1086 m `thenP` k = \ buf s ->
1088 POk s1 a -> k a buf s1
1089 PFailed err -> PFailed err
1091 thenP_ :: P a -> P b -> P b
1092 m `thenP_` k = m `thenP` \_ -> k
1094 mapP :: (a -> P b) -> [a] -> P [b]
1095 mapP f [] = returnP []
1098 mapP f as `thenP` \bs ->
1101 failP :: String -> P a
1102 failP msg buf s = PFailed (text msg)
1104 failMsgP :: Message -> P a
1105 failMsgP msg buf s = PFailed msg
1107 lexError :: String -> P a
1108 lexError str buf s@PState{ loc = loc }
1109 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1111 getSrcLocP :: P SrcLoc
1112 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1114 -- use a temporary SrcLoc for the duration of the argument
1115 setSrcLocP :: SrcLoc -> P a -> P a
1116 setSrcLocP new_loc p buf s =
1117 case p buf s{ loc=new_loc } of
1119 PFailed e -> PFailed e
1121 getSrcFile :: P FastString
1122 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1124 pushContext :: LayoutContext -> P ()
1125 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1129 This special case in layoutOn is to handle layout contexts with are
1130 indented the same or less than the current context. This is illegal
1131 according to the Haskell spec, so we have to arrange to close the
1132 current context. eg.
1137 after the first 'where', the sequence of events is:
1139 - layout system inserts a ';' (column 0)
1140 - parser begins a new context at column 0
1141 - parser shifts ';' (legal empty declaration)
1142 - parser sees 'class': parse error (we're still in the inner context)
1144 trouble is, by the time we know we need a new context, the lexer has
1145 already generated the ';'. Hacky solution is as follows: since we
1146 know the column of the next token (it's the column number of the new
1147 context), we set the ACTUAL column number of the new context to this
1148 numer plus one. Hence the next time the lexer is called, a '}' will
1149 be generated to close the new context straight away. Furthermore, we
1150 have to set the atbol flag so that the ';' that the parser shifted as
1151 part of the new context is re-generated.
1153 when the new context is *less* indented than the current one:
1155 f = f where g = g where
1158 - current context: column 12.
1159 - on seeing 'h' (column 0), the layout system inserts '}'
1160 - parser starts a new context, column 0
1161 - parser sees '}', uses it to close new context
1162 - we still need to insert another '}' followed by a ';',
1163 hence the atbol trick.
1165 There's also a special hack in here to deal with
1172 i.e. the inner context is at the same indentation level as the outer
1173 context. This is strictly illegal according to Haskell 98, but
1174 there's a lot of existing code using this style and it doesn't make
1175 any sense to disallow it, since empty 'do' lists don't make sense.
1178 layoutOn :: Bool -> P ()
1179 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1180 let offset = lexemeIndex buf -# bol in
1183 | if strict then prev_off >=# offset else prev_off ># offset ->
1184 --trace ("layout on, column: " ++ show (I# offset)) $
1185 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1187 --trace ("layout on, column: " ++ show (I# offset)) $
1188 POk s{ context = Layout offset : ctx } ()
1191 layoutOff buf s@(PState{ context = ctx }) =
1192 POk s{ context = NoLayout:ctx } ()
1195 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1197 (_:tl) -> POk s{ context = tl } ()
1198 [] -> PFailed (srcParseErr buf loc)
1200 -- for reasons of efficiency, flags indicating language extensions (eg,
1201 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1204 glaExtsBit, ffiBit, parrBit :: Int
1210 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1211 glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit
1212 ffiEnabled flags = testBit (toInt32 flags) ffiBit
1213 withEnabled flags = testBit (toInt32 flags) withBit
1214 parrEnabled flags = testBit (toInt32 flags) parrBit
1216 toInt32 :: Int# -> Int32
1217 toInt32 x# = fromIntegral (I# x#)
1219 -- convenient record-based bitmap for the interface to the rest of the world
1221 -- NB: `glasgowExtsEF' implies `ffiEF' (see `mkPState' below)
1223 data ExtFlags = ExtFlags {
1224 glasgowExtsEF :: Bool,
1230 -- create a parse state
1232 mkPState :: SrcLoc -> ExtFlags -> PState
1236 extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits},
1242 bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts
1243 .|. ffiBit `setBitIf` (ffiEF exts
1244 || glasgowExtsEF exts)
1245 .|. withBit `setBitIf` withEF exts
1246 .|. parrBit `setBitIf` parrEF exts
1248 setBitIf :: Int -> Bool -> Int32
1249 b `setBitIf` cond | cond = bit b
1252 -----------------------------------------------------------------------------
1254 srcParseErr :: StringBuffer -> SrcLoc -> Message
1258 then ptext SLIT(": parse error (possibly incorrect indentation)")
1259 else hcat [ptext SLIT(": parse error on input "),
1260 char '`', text token, char '\'']
1263 token = lexemeToString s