2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
24 Token(..), lexer, ParseResult(..), PState(..),
25 ExtFlags(..), mkPState,
28 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
29 getSrcLocP, setSrcLocP, getSrcFile,
30 layoutOn, layoutOff, pushContext, popContext
33 #include "HsVersions.h"
35 import Char ( toUpper, isDigit, chr, ord )
38 import PrelNames ( mkTupNameStr )
39 import ForeignCall ( Safety(..) )
40 import UniqFM ( listToUFM, lookupUFM )
41 import BasicTypes ( Boxity(..) )
42 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
43 replaceSrcLine, mkSrcLoc )
45 import ErrUtils ( Message )
53 import Bits ( Bits(..) ) -- non-std
57 %************************************************************************
59 \subsection{Data types}
61 %************************************************************************
63 The token data type, fairly un-interesting except from one
64 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
65 strictness, unfolding etc).
67 The Idea/Observation here is that the renamer needs to scan through
68 all of an interface file before it can continue. But only a fraction
69 of the information contained in the file turns out to be useful, so
70 delaying as much as possible of the scanning and parsing of an
71 interface file Makes Sense (Heap profiles of the compiler
72 show a reduction in heap usage by at least a factor of two,
75 Hence, the interface file lexer spots when value declarations are
76 being scanned and return the @ITidinfo@ and @ITtype@ constructors
77 for the type and any other id info for that binding (unfolding, strictness
78 etc). These constructors are applied to the result of lexing these sub-chunks.
80 The lexing of the type and id info is all done lazily, of course, so
81 the scanning (and subsequent parsing) will be done *only* on the ids the
82 renamer finds out that it is interested in. The rest will just be junked.
83 Laziness, you know it makes sense :-)
87 = ITas -- Haskell keywords
111 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
113 | ITforall -- GHC extension keywords
125 | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc)
127 | ITspecialise_prag -- Pragmas
137 | ITdotdot -- reserved symbols
152 | ITbiglam -- GHC-extension symbols
154 | ITocurly -- special symbols
156 | ITocurlybar -- {|, for type applications
157 | ITccurlybar -- |}, for type applications
160 | ITopabrack -- [:, for parallel arrays with -fparr
161 | ITcpabrack -- :], for parallel arrays with -fparr
172 | ITvarid FAST_STRING -- identifiers
173 | ITconid FAST_STRING
174 | ITvarsym FAST_STRING
175 | ITconsym FAST_STRING
176 | ITqvarid (FAST_STRING,FAST_STRING)
177 | ITqconid (FAST_STRING,FAST_STRING)
178 | ITqvarsym (FAST_STRING,FAST_STRING)
179 | ITqconsym (FAST_STRING,FAST_STRING)
181 | ITdupipvarid FAST_STRING -- GHC extension: implicit param: ?x
182 | ITsplitipvarid FAST_STRING -- GHC extension: implicit param: %x
184 | ITpragma StringBuffer
187 | ITstring FAST_STRING
189 | ITrational Rational
192 | ITprimstring FAST_STRING
194 | ITprimfloat Rational
195 | ITprimdouble Rational
196 | ITlitlit FAST_STRING
198 | ITunknown String -- Used when the lexer can't make sense of it
199 | ITeof -- end of file token
200 deriving Show -- debugging
203 -----------------------------------------------------------------------------
207 pragmaKeywordsFM = listToUFM $
208 map (\ (x,y) -> (_PK_ x,y))
209 [( "SPECIALISE", ITspecialise_prag ),
210 ( "SPECIALIZE", ITspecialise_prag ),
211 ( "SOURCE", ITsource_prag ),
212 ( "INLINE", ITinline_prag ),
213 ( "NOINLINE", ITnoinline_prag ),
214 ( "NOTINLINE", ITnoinline_prag ),
215 ( "LINE", ITline_prag ),
216 ( "RULES", ITrules_prag ),
217 ( "RULEZ", ITrules_prag ), -- american spelling :-)
218 ( "SCC", ITscc_prag ),
219 ( "DEPRECATED", ITdeprecated_prag )
222 haskellKeywordsFM = listToUFM $
223 map (\ (x,y) -> (_PK_ x,y))
224 [( "_", ITunderscore ),
227 ( "class", ITclass ),
229 ( "default", ITdefault ),
230 ( "deriving", ITderiving ),
233 ( "hiding", IThiding ),
235 ( "import", ITimport ),
237 ( "infix", ITinfix ),
238 ( "infixl", ITinfixl ),
239 ( "infixr", ITinfixr ),
240 ( "instance", ITinstance ),
242 ( "module", ITmodule ),
243 ( "newtype", ITnewtype ),
245 ( "qualified", ITqualified ),
248 ( "where", ITwhere ),
249 ( "_scc_", ITscc ) -- ToDo: remove
252 isSpecial :: Token -> Bool
253 -- If we see M.x, where x is a keyword, but
254 -- is special, we treat is as just plain M.x,
256 isSpecial ITas = True
257 isSpecial IThiding = True
258 isSpecial ITqualified = True
259 isSpecial ITforall = True
260 isSpecial ITexport = True
261 isSpecial ITlabel = True
262 isSpecial ITdynamic = True
263 isSpecial ITsafe = True
264 isSpecial ITthreadsafe = True
265 isSpecial ITunsafe = True
266 isSpecial ITwith = True
267 isSpecial ITccallconv = True
268 isSpecial ITstdcallconv = True
271 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
272 ghcExtensionKeywordsFM = listToUFM $
273 map (\ (x,y) -> (_PK_ x,y))
274 [ ( "forall", ITforall ),
275 ( "foreign", ITforeign ),
276 ( "export", ITexport ),
277 ( "label", ITlabel ),
278 ( "dynamic", ITdynamic ),
280 ( "threadsafe", ITthreadsafe ),
281 ( "unsafe", ITunsafe ),
283 ( "stdcall", ITstdcallconv),
284 ( "ccall", ITccallconv),
285 ( "dotnet", ITdotnet),
286 ("_ccall_", ITccall (False, False, PlayRisky)),
287 ("_ccall_GC_", ITccall (False, False, PlaySafe False)),
288 ("_casm_", ITccall (False, True, PlayRisky)),
289 ("_casm_GC_", ITccall (False, True, PlaySafe False))
293 haskellKeySymsFM = listToUFM $
294 map (\ (x,y) -> (_PK_ x,y))
308 ,(".", ITdot) -- sadly, for 'forall a . t'
312 -----------------------------------------------------------------------------
317 - (exts) lexing a source with extensions, eg, an interface file or
319 - (bol) pointer to beginning of line (for column calculations)
320 - (buf) pointer to beginning of token
321 - (buf) pointer to current char
322 - (atbol) flag indicating whether we're at the beginning of a line
325 lexer :: (Token -> P a) -> P a
326 lexer cont buf s@(PState{
334 -- first, start a new lexeme and lose all the whitespace
336 tab line bol atbol (stepOverLexeme buf)
338 line = srcLocLine loc
340 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
341 case currentChar# buf of
344 if bufferExhausted (stepOn buf)
345 then cont ITeof buf s'
346 else trace "lexer: misplaced NUL?" $
347 tab y bol atbol (stepOn buf)
349 '\n'# -> let buf' = stepOn buf
350 in tab (y +# 1#) (currentIndex# buf') 1# buf'
352 -- find comments. This got harder in Haskell 98.
353 '-'# -> let trundle n =
354 let next = lookAhead# buf n in
355 if next `eqChar#` '-'# then trundle (n +# 1#)
356 else if is_symbol next || n <# 2#
359 (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
362 -- comments and pragmas. We deal with LINE pragmas here,
363 -- and throw out any unrecognised pragmas as comments. Any
364 -- pragmas we know about are dealt with later (after any layout
365 -- processing if necessary).
366 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
367 if lookAhead# buf 2# `eqChar#` '#'# then
368 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
369 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
370 let lexeme = mkFastString -- ToDo: too slow
371 (map toUpper (lexemeToString buf2)) in
372 case lookupUFM pragmaKeywordsFM lexeme of
373 -- ignore RULES pragmas when -fglasgow-exts is off
374 Just ITrules_prag | not (glaExtsEnabled exts) ->
375 skip_to_end (stepOnBy# buf 2#) s'
377 line_prag skip_to_end buf2 s'
378 Just other -> is_a_token
379 Nothing -> skip_to_end (stepOnBy# buf 2#) s'
382 else skip_to_end (stepOnBy# buf 2#) s'
384 skip_to_end = skipNestedComment (lexer cont)
386 -- special GHC extension: we grok cpp-style #line pragmas
387 '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0
388 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
389 lookAhead# buf 2# `eqChar#` 'i'# &&
390 lookAhead# buf 3# `eqChar#` 'n'# &&
391 lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5#
392 | otherwise = stepOn buf
394 case expandWhile# is_space buf1 of { buf2 ->
395 if is_digit (currentChar# buf2)
396 then line_prag next_line buf2 s'
400 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
402 -- tabs have been expanded beforehand
403 c | is_space c -> tab y bol atbol (stepOn buf)
404 | otherwise -> is_a_token
406 where s' = s{loc = replaceSrcLine loc y,
410 is_a_token | atbol /=# 0# = lexBOL cont buf s'
411 | otherwise = lexToken cont exts buf s'
413 -- {-# LINE .. #-} pragmas. yeuch.
414 line_prag cont buf s@PState{loc=loc} =
415 case expandWhile# is_space buf of { buf1 ->
416 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
417 -- subtract one: the line number refers to the *following* line.
418 let real_line = line - 1 in
419 case fromInteger real_line of { i@(I# l) ->
420 -- ToDo, if no filename then we skip the newline.... d'oh
421 case expandWhile# is_space buf2 of { buf3 ->
422 case currentChar# buf3 of
424 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
426 file = lexemeToFastString buf4
427 new_buf = stepOn (stepOverLexeme buf4)
429 if nullFastString file
430 then cont new_buf s{loc = replaceSrcLine loc l}
431 else cont new_buf s{loc = mkSrcLoc file i}
433 _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
436 skipNestedComment :: P a -> P a
437 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
439 skipNestedComment' :: SrcLoc -> P a -> P a
440 skipNestedComment' orig_loc cont buf = loop buf
443 case currentChar# buf of
444 '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
446 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
448 (skipNestedComment' orig_loc cont)
451 '\n'# -> \ s@PState{loc=loc} ->
452 let buf' = stepOn buf in
453 loop buf' s{loc = incSrcLine loc,
454 bol = currentIndex# buf',
457 -- pass the original SrcLoc to lexError so that the error is
458 -- reported at the line it was originally on, not the line at
459 -- the end of the file.
460 '\NUL'# | bufferExhausted (stepOn buf) ->
461 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
463 _ -> loop (stepOn buf)
465 -- When we are lexing the first token of a line, check whether we need to
466 -- insert virtual semicolons or close braces due to layout.
468 lexBOL :: (Token -> P a) -> P a
469 lexBOL cont buf s@(PState{
476 if need_close_curly then
477 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
478 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
479 else if need_semi_colon then
480 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
481 cont ITsemi buf s{atbol = 0#}
483 lexToken cont exts buf s{atbol = 0#}
485 col = currentIndex# buf -# bol
498 Layout n -> col ==# n
501 lexToken :: (Token -> P a) -> Int# -> P a
502 lexToken cont exts buf =
503 -- trace "lexToken" $
504 case currentChar# buf of
506 -- special symbols ----------------------------------------------------
507 '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
508 -> cont IToubxparen (setCurrentPos# buf 2#)
510 -> cont IToparen (incLexeme buf)
512 ')'# -> cont ITcparen (incLexeme buf)
513 '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
514 cont ITopabrack (setCurrentPos# buf 2#)
516 cont ITobrack (incLexeme buf)
517 ']'# -> cont ITcbrack (incLexeme buf)
518 ','# -> cont ITcomma (incLexeme buf)
519 ';'# -> cont ITsemi (incLexeme buf)
520 '}'# -> \ s@PState{context = ctx} ->
522 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
523 _ -> lexError "too many '}'s" buf s
524 '|'# -> case lookAhead# buf 1# of
525 '}'# | glaExtsEnabled exts -> cont ITccurlybar
526 (setCurrentPos# buf 2#)
527 _ -> lex_sym cont (incLexeme buf)
528 ':'# -> case lookAhead# buf 1# of
529 ']'# | parrEnabled exts -> cont ITcpabrack
530 (setCurrentPos# buf 2#)
531 _ -> lex_sym cont (incLexeme buf)
534 '#'# -> case lookAhead# buf 1# of
535 ')'# | glaExtsEnabled exts
536 -> cont ITcubxparen (setCurrentPos# buf 2#)
537 '-'# -> case lookAhead# buf 2# of
538 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
539 _ -> lex_sym cont (incLexeme buf)
540 _ -> lex_sym cont (incLexeme buf)
542 '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
543 -> lex_cstring cont (setCurrentPos# buf 2#)
545 -> cont ITbackquote (incLexeme buf)
547 '{'# -> -- for Emacs: -}
548 case lookAhead# buf 1# of
549 '|'# | glaExtsEnabled exts
550 -> cont ITocurlybar (setCurrentPos# buf 2#)
551 '-'# -> case lookAhead# buf 2# of
552 '#'# -> lex_prag cont (setCurrentPos# buf 3#)
553 _ -> cont ITocurly (incLexeme buf)
554 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
556 -- strings/characters -------------------------------------------------
557 '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
558 '\''# -> lex_char (char_end cont) exts (incLexeme buf)
560 -- Hexadecimal and octal constants
561 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
562 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
563 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
564 -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec
565 where ch = lookAhead# buf 1#
566 ch2 = lookAhead# buf 2#
567 buf' = setCurrentPos# buf 2#
570 if bufferExhausted (stepOn buf) then
573 trace "lexIface: misplaced NUL?" $
574 cont (ITunknown "\NUL") (stepOn buf)
576 '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
577 lex_ip ITdupipvarid cont (incLexeme buf)
578 '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
579 lex_ip ITsplitipvarid cont (incLexeme buf)
580 c | is_digit c -> lex_num cont exts 0 buf
581 | is_symbol c -> lex_sym cont buf
582 | is_upper c -> lex_con cont exts buf
583 | is_ident c -> lex_id cont exts buf
584 | otherwise -> lexError "illegal character" buf
586 -- Int# is unlifted, and therefore faster than Bool for flags.
592 -------------------------------------------------------------------------------
596 = case expandWhile# is_space buf of { buf1 ->
597 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
598 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
599 case lookupUFM pragmaKeywordsFM lexeme of
600 Just kw -> cont kw (mergeLexemes buf buf2)
601 Nothing -> panic "lex_prag"
604 -------------------------------------------------------------------------------
607 lex_string cont exts s buf
608 = case currentChar# buf of
610 let buf' = incLexeme buf
611 s' = mkFastStringNarrow (map chr (reverse s))
612 in case currentChar# buf' of
613 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
614 then cont (ITprimstring s') (incLexeme buf')
615 else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
616 _ -> cont (ITstring s') buf'
618 -- ignore \& in a string, deal with string gaps
619 '\\'# | next_ch `eqChar#` '&'#
620 -> lex_string cont exts s buf'
622 -> lex_stringgap cont exts s (incLexeme buf)
624 where next_ch = lookAhead# buf 1#
625 buf' = setCurrentPos# buf 2#
627 _ -> lex_char (lex_next_string cont s) exts buf
629 lex_stringgap cont exts s buf
630 = let buf' = incLexeme buf in
631 case currentChar# buf of
632 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
633 st{loc = incSrcLine loc}
634 '\\'# -> lex_string cont exts s buf'
635 c | is_space c -> lex_stringgap cont exts s buf'
636 other -> charError buf'
638 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
640 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
641 lex_char cont exts buf
642 = case currentChar# buf of
643 '\\'# -> lex_escape (cont exts) (incLexeme buf)
644 c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
645 other -> charError buf
647 char_end cont exts c buf
648 = case currentChar# buf of
649 '\''# -> let buf' = incLexeme buf in
650 case currentChar# buf' of
651 '#'# | glaExtsEnabled exts
652 -> cont (ITprimchar c) (incLexeme buf')
653 _ -> cont (ITchar c) buf'
657 = let buf' = incLexeme buf in
658 case currentChar# buf of
659 'a'# -> cont (ord '\a') buf'
660 'b'# -> cont (ord '\b') buf'
661 'f'# -> cont (ord '\f') buf'
662 'n'# -> cont (ord '\n') buf'
663 'r'# -> cont (ord '\r') buf'
664 't'# -> cont (ord '\t') buf'
665 'v'# -> cont (ord '\v') buf'
666 '\\'# -> cont (ord '\\') buf'
667 '"'# -> cont (ord '\"') buf'
668 '\''# -> cont (ord '\'') buf'
669 '^'# -> let c = currentChar# buf' in
670 if c `geChar#` '@'# && c `leChar#` '_'#
671 then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
674 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
675 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
677 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
679 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
680 Just buf2 <- [prefixMatch buf p] ] of
681 (c,buf2):_ -> cont (ord c) buf2
684 after_charnum cont i buf
685 = if i >= 0 && i <= 0x10FFFF
686 then cont (fromInteger i) buf
689 readNum cont buf is_digit base conv = read buf 0
691 = case currentChar# buf of { c ->
693 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
699 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
700 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
702 hex c | is_digit c = ord# c -# ord# '0'#
703 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
704 oct_or_dec c = ord# c -# ord# '0'#
706 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
709 | c `geChar#` 'A'# && c `leChar#` 'Z'#
710 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
713 charError buf = lexError "error in character literal" buf
715 silly_escape_chars = [
752 -----------------------------------------------------------------------------
755 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
756 lex_num cont exts acc buf =
757 case scanNumLit acc buf of
759 case currentChar# buf' of
760 '.'# | is_digit (lookAhead# buf' 1#) ->
761 -- this case is not optimised at all, as the
762 -- presence of floating point numbers in interface
763 -- files is not that common. (ToDo)
764 case expandWhile# is_digit (incLexeme buf') of
765 buf2 -> -- points to first non digit char
767 let l = case currentChar# buf2 of
773 = let buf3 = incLexeme buf2 in
774 case currentChar# buf3 of
775 '-'# | is_digit (lookAhead# buf3 1#)
776 -> expandWhile# is_digit (incLexeme buf3)
777 '+'# | is_digit (lookAhead# buf3 1#)
778 -> expandWhile# is_digit (incLexeme buf3)
779 x | is_digit x -> expandWhile# is_digit buf3
782 v = readRational__ (lexemeToString l)
784 in case currentChar# l of -- glasgow exts only
785 '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
786 case currentChar# l' of
787 '#'# -> cont (ITprimdouble v) (incLexeme l')
788 _ -> cont (ITprimfloat v) l'
789 _ -> cont (ITrational v) l
791 _ -> after_lexnum cont exts acc' buf'
793 after_lexnum cont exts i buf
794 = case currentChar# buf of
795 '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
796 _ -> cont (ITinteger i) buf
798 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
802 return ((n%1)*10^^(k-d), t)
805 (ds,s) <- lexDecDigits r
806 (ds',t) <- lexDotDigits s
807 return (read (ds++ds'), length ds', t)
809 readExp (e:s) | e `elem` "eE" = readExp' s
810 readExp s = return (0,s)
812 readExp' ('+':s) = readDec s
813 readExp' ('-':s) = do
816 readExp' s = readDec s
819 (ds,r) <- nonnull isDigit s
820 return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
823 lexDecDigits = nonnull isDigit
825 lexDotDigits ('.':s) = return (span isDigit s)
826 lexDotDigits s = return ("",s)
828 nonnull p s = do (cs@(_:_),t) <- return (span p s)
831 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
834 '-' : xs -> - (read_me xs)
838 = case (do { (x,"") <- readRational s ; return x }) of
840 [] -> error ("readRational__: no parse:" ++ top_s)
841 _ -> error ("readRational__: ambiguous parse:" ++ top_s)
843 -----------------------------------------------------------------------------
844 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
846 -- we lexemeToFastString on the bit between the ``''s, but include the
847 -- quotes in the full lexeme.
849 lex_cstring cont buf =
850 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
851 Just buf' -> cont (ITlitlit (lexemeToFastString
852 (setCurrentPos# buf' (negateInt# 2#))))
853 (mergeLexemes buf buf')
854 Nothing -> lexError "unterminated ``" buf
856 -----------------------------------------------------------------------------
857 -- identifiers, symbols etc.
859 lex_ip ip_constr cont buf =
860 case expandWhile# is_ident buf of
861 buf' -> cont (ip_constr (tailFS lexeme)) buf'
862 where lexeme = lexemeToFastString buf'
864 lex_id cont exts buf =
865 let buf1 = expandWhile# is_ident buf in
868 case (if glaExtsEnabled exts
869 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
870 else buf1) of { buf' ->
873 let lexeme = lexemeToFastString buf' in
875 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
876 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
880 let var_token = cont (ITvarid lexeme) buf' in
882 if not (glaExtsEnabled exts)
886 case lookupUFM ghcExtensionKeywordsFM lexeme of {
887 Just kwd_token -> cont kwd_token buf';
894 case expandWhile# is_symbol buf of
895 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
896 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
897 cont kwd_token buf' ;
898 Nothing -> --trace ("sym: "++unpackFS lexeme) $
899 cont (mk_var_token lexeme) buf'
901 where lexeme = lexemeToFastString buf'
904 -- lex_con recursively collects components of a qualified identifer.
905 -- The argument buf is the StringBuffer representing the lexeme
906 -- identified so far, where the next character is upper-case.
908 lex_con cont exts buf =
909 -- trace ("con: "{-++unpackFS lexeme-}) $
910 let empty_buf = stepOverLexeme buf in
911 case expandWhile# is_ident empty_buf of { buf1 ->
912 case slurp_trailing_hashes buf1 exts of { con_buf ->
914 let all_buf = mergeLexemes buf con_buf
916 con_lexeme = lexemeToFastString con_buf
917 mod_lexeme = lexemeToFastString (decLexeme buf)
918 all_lexeme = lexemeToFastString all_buf
921 | emptyLexeme buf = cont (ITconid con_lexeme) all_buf
922 | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
925 case currentChar# all_buf of
926 '.'# -> maybe_qualified cont exts all_lexeme
927 (incLexeme all_buf) just_a_conid
932 maybe_qualified cont exts mod buf just_a_conid =
933 -- trace ("qid: "{-++unpackFS lexeme-}) $
934 case currentChar# buf of
935 '['# -> -- Special case for []
936 case lookAhead# buf 1# of
937 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
940 '('# -> -- Special case for (,,,)
941 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
942 case lookAhead# buf 1# of
943 '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
944 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
947 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
948 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
951 '-'# -> case lookAhead# buf 1# of
952 '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
953 _ -> lex_id3 cont exts mod buf just_a_conid
955 _ -> lex_id3 cont exts mod buf just_a_conid
958 lex_id3 cont exts mod buf just_a_conid
959 | is_upper (currentChar# buf) =
960 lex_con cont exts buf
962 | is_symbol (currentChar# buf) =
964 start_new_lexeme = stepOverLexeme buf
966 -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
967 case expandWhile# is_symbol start_new_lexeme of { buf' ->
969 lexeme = lexemeToFastString buf'
970 -- real lexeme is M.<sym>
971 new_buf = mergeLexemes buf buf'
973 cont (mk_qvar_token mod lexeme) new_buf
974 -- wrong, but arguably morally right: M... is now a qvarsym
979 start_new_lexeme = stepOverLexeme buf
981 -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
982 case expandWhile# is_ident start_new_lexeme of { buf1 ->
987 case slurp_trailing_hashes buf1 exts of { buf' ->
990 lexeme = lexemeToFastString buf'
991 new_buf = mergeLexemes buf buf'
992 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
994 case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
995 Nothing -> is_a_qvarid ;
997 Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be
998 -> is_a_qvarid -- recognised as keywords here.
1000 -> just_a_conid -- avoid M.where etc.
1003 slurp_trailing_hashes buf exts
1004 | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1009 | is_upper f = ITconid pk_str
1010 | is_ident f = ITvarid pk_str
1011 | f `eqChar#` ':'# = ITconsym pk_str
1012 | otherwise = ITvarsym pk_str
1014 (C# f) = _HEAD_ pk_str
1015 -- tl = _TAIL_ pk_str
1017 mk_qvar_token m token =
1018 -- trace ("mk_qvar ") $
1019 case mk_var_token token of
1020 ITconid n -> ITqconid (m,n)
1021 ITvarid n -> ITqvarid (m,n)
1022 ITconsym n -> ITqconsym (m,n)
1023 ITvarsym n -> ITqvarsym (m,n)
1024 _ -> ITunknown (show token)
1027 ----------------------------------------------------------------------------
1028 Horrible stuff for dealing with M.(,,,)
1031 lex_tuple cont mod buf back_off =
1035 case currentChar# buf of
1036 ','# -> go (n+1) (stepOn buf)
1037 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1040 lex_ubx_tuple cont mod buf back_off =
1044 case currentChar# buf of
1045 ','# -> go (n+1) (stepOn buf)
1046 '#'# -> case lookAhead# buf 1# of
1047 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1053 -----------------------------------------------------------------------------
1064 data PState = PState {
1066 extsBitmap :: Int#, -- bitmap that determines permitted extensions
1069 context :: [LayoutContext]
1072 type P a = StringBuffer -- Input string
1077 returnP a buf s = POk s a
1079 thenP :: P a -> (a -> P b) -> P b
1080 m `thenP` k = \ buf s ->
1082 POk s1 a -> k a buf s1
1083 PFailed err -> PFailed err
1085 thenP_ :: P a -> P b -> P b
1086 m `thenP_` k = m `thenP` \_ -> k
1088 mapP :: (a -> P b) -> [a] -> P [b]
1089 mapP f [] = returnP []
1092 mapP f as `thenP` \bs ->
1095 failP :: String -> P a
1096 failP msg buf s = PFailed (text msg)
1098 failMsgP :: Message -> P a
1099 failMsgP msg buf s = PFailed msg
1101 lexError :: String -> P a
1102 lexError str buf s@PState{ loc = loc }
1103 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1105 getSrcLocP :: P SrcLoc
1106 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1108 -- use a temporary SrcLoc for the duration of the argument
1109 setSrcLocP :: SrcLoc -> P a -> P a
1110 setSrcLocP new_loc p buf s =
1111 case p buf s{ loc=new_loc } of
1113 PFailed e -> PFailed e
1115 getSrcFile :: P FAST_STRING
1116 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1118 pushContext :: LayoutContext -> P ()
1119 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1123 This special case in layoutOn is to handle layout contexts with are
1124 indented the same or less than the current context. This is illegal
1125 according to the Haskell spec, so we have to arrange to close the
1126 current context. eg.
1131 after the first 'where', the sequence of events is:
1133 - layout system inserts a ';' (column 0)
1134 - parser begins a new context at column 0
1135 - parser shifts ';' (legal empty declaration)
1136 - parser sees 'class': parse error (we're still in the inner context)
1138 trouble is, by the time we know we need a new context, the lexer has
1139 already generated the ';'. Hacky solution is as follows: since we
1140 know the column of the next token (it's the column number of the new
1141 context), we set the ACTUAL column number of the new context to this
1142 numer plus one. Hence the next time the lexer is called, a '}' will
1143 be generated to close the new context straight away. Furthermore, we
1144 have to set the atbol flag so that the ';' that the parser shifted as
1145 part of the new context is re-generated.
1147 when the new context is *less* indented than the current one:
1149 f = f where g = g where
1152 - current context: column 12.
1153 - on seeing 'h' (column 0), the layout system inserts '}'
1154 - parser starts a new context, column 0
1155 - parser sees '}', uses it to close new context
1156 - we still need to insert another '}' followed by a ';',
1157 hence the atbol trick.
1159 There's also a special hack in here to deal with
1166 i.e. the inner context is at the same indentation level as the outer
1167 context. This is strictly illegal according to Haskell 98, but
1168 there's a lot of existing code using this style and it doesn't make
1169 any sense to disallow it, since empty 'do' lists don't make sense.
1172 layoutOn :: Bool -> P ()
1173 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1174 let offset = lexemeIndex buf -# bol in
1177 | if strict then prev_off >=# offset else prev_off ># offset ->
1178 --trace ("layout on, column: " ++ show (I# offset)) $
1179 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1181 --trace ("layout on, column: " ++ show (I# offset)) $
1182 POk s{ context = Layout offset : ctx } ()
1185 layoutOff buf s@(PState{ context = ctx }) =
1186 POk s{ context = NoLayout:ctx } ()
1189 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1191 (_:tl) -> POk s{ context = tl } ()
1192 [] -> PFailed (srcParseErr buf loc)
1194 -- for reasons of efficiency, flags indicating language extensions (eg,
1195 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1198 glaExtsBit, ffiBit, parrBit :: Int
1200 ffiBit = 1 -- FIXME: not used yet; still part of `glaExtsBit'
1203 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1204 glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit
1205 ffiEnabled flags = testBit (toInt32 flags) ffiBit
1206 parrEnabled flags = testBit (toInt32 flags) parrBit
1208 toInt32 :: Int# -> Int32
1209 toInt32 x# = fromIntegral (I# x#)
1211 -- convenient record-based bitmap for the interface to the rest of the world
1213 data ExtFlags = ExtFlags {
1214 glasgowExtsEF :: Bool,
1215 -- ffiEF :: Bool, -- commented out to avoid warnings
1216 parrEF :: Bool -- while not used yet
1219 -- create a parse state
1221 mkPState :: SrcLoc -> ExtFlags -> PState
1222 mkPState loc exts = PState {
1224 extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits},
1230 bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts
1231 -- .|. ffiBit `setBitIf` ffiEF exts
1232 .|. parrBit `setBitIf` parrEF exts
1234 setBitIf :: Int -> Bool -> Int32
1235 b `setBitIf` cond | cond = bit b
1238 -----------------------------------------------------------------------------
1240 srcParseErr :: StringBuffer -> SrcLoc -> Message
1244 then ptext SLIT(": parse error (possibly incorrect indentation)")
1245 else hcat [ptext SLIT(": parse error on input "),
1246 char '`', text token, char '\'']
1249 token = lexemeToString s