2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 Token(..), lexer, ParseResult(..), PState(..),
30 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
31 getSrcLocP, getSrcFile,
32 layoutOn, layoutOff, pushContext, popContext
35 #include "HsVersions.h"
37 import Char ( ord, isSpace, toUpper )
38 import List ( isSuffixOf )
40 import IdInfo ( InlinePragInfo(..), CprInfo(..) )
41 import Name ( isLowerISO, isUpperISO )
42 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
43 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
44 import Demand ( Demand(..) {- instance Read -} )
45 import UniqFM ( UniqFM, listToUFM, lookupUFM)
46 import BasicTypes ( NewOrData(..) )
47 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
48 replaceSrcLine, mkSrcLoc )
50 import Maybes ( MaybeErr(..) )
51 import ErrUtils ( Message )
59 #if __GLASGOW_HASKELL__ >= 303
66 import PrelRead ( readRational__ ) -- Glasgow non-std
69 %************************************************************************
71 \subsection{Data types}
73 %************************************************************************
75 The token data type, fairly un-interesting except from one
76 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
77 strictness, unfolding etc).
79 The Idea/Observation here is that the renamer needs to scan through
80 all of an interface file before it can continue. But only a fraction
81 of the information contained in the file turns out to be useful, so
82 delaying as much as possible of the scanning and parsing of an
83 interface file Makes Sense (Heap profiles of the compiler
84 show a reduction in heap usage by at least a factor of two,
87 Hence, the interface file lexer spots when value declarations are
88 being scanned and return the @ITidinfo@ and @ITtype@ constructors
89 for the type and any other id info for that binding (unfolding, strictness
90 etc). These constructors are applied to the result of lexing these sub-chunks.
92 The lexing of the type and id info is all done lazily, of course, so
93 the scanning (and subsequent parsing) will be done *only* on the ids the
94 renamer finds out that it is interested in. The rest will just be junked.
95 Laziness, you know it makes sense :-)
99 = ITas -- Haskell keywords
125 | ITforall -- GHC extension keywords
135 | ITinterface -- interface keywords
143 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
158 | ITunfold InlinePragInfo
159 | ITstrict ([Demand], Bool)
162 | ITcprinfo (CprInfo)
166 | ITspecialise_prag -- Pragmas
175 | ITdotdot -- reserved symbols
189 | ITbiglam -- GHC-extension symbols
191 | ITocurly -- special symbols
205 | ITvarid FAST_STRING -- identifiers
206 | ITconid FAST_STRING
207 | ITvarsym FAST_STRING
208 | ITconsym FAST_STRING
209 | ITqvarid (FAST_STRING,FAST_STRING)
210 | ITqconid (FAST_STRING,FAST_STRING)
211 | ITqvarsym (FAST_STRING,FAST_STRING)
212 | ITqconsym (FAST_STRING,FAST_STRING)
214 | ITipvarid FAST_STRING -- GHC extension: implicit param: ?x
216 | ITpragma StringBuffer
219 | ITstring FAST_STRING
221 | ITrational Rational
224 | ITprimstring FAST_STRING
226 | ITprimfloat Rational
227 | ITprimdouble Rational
228 | ITlitlit FAST_STRING
230 | ITunknown String -- Used when the lexer can't make sense of it
231 | ITeof -- end of file token
232 deriving Text -- debugging
235 -----------------------------------------------------------------------------
239 pragmaKeywordsFM = listToUFM $
240 map (\ (x,y) -> (_PK_ x,y))
241 [( "SPECIALISE", ITspecialise_prag ),
242 ( "SPECIALIZE", ITspecialise_prag ),
243 ( "SOURCE", ITsource_prag ),
244 ( "INLINE", ITinline_prag ),
245 ( "NOINLINE", ITnoinline_prag ),
246 ( "NOTINLINE", ITnoinline_prag ),
247 ( "LINE", ITline_prag ),
248 ( "RULES", ITrules_prag ),
249 ( "RULEZ", ITrules_prag ), -- american spelling :-)
250 ( "DEPRECATED", ITdeprecated_prag )
253 haskellKeywordsFM = listToUFM $
254 map (\ (x,y) -> (_PK_ x,y))
255 [( "_", ITunderscore ),
258 ( "class", ITclass ),
260 ( "default", ITdefault ),
261 ( "deriving", ITderiving ),
264 ( "hiding", IThiding ),
266 ( "import", ITimport ),
268 ( "infix", ITinfix ),
269 ( "infixl", ITinfixl ),
270 ( "infixr", ITinfixr ),
271 ( "instance", ITinstance ),
273 ( "module", ITmodule ),
274 ( "newtype", ITnewtype ),
276 ( "qualified", ITqualified ),
279 ( "where", ITwhere ),
283 ghcExtensionKeywordsFM = listToUFM $
284 map (\ (x,y) -> (_PK_ x,y))
285 [ ( "forall", ITforall ),
286 ( "foreign", ITforeign ),
287 ( "export", ITexport ),
288 ( "label", ITlabel ),
289 ( "dynamic", ITdynamic ),
290 ( "unsafe", ITunsafe ),
292 ( "stdcall", ITstdcallconv),
293 ( "ccall", ITccallconv),
294 ("_ccall_", ITccall (False, False, False)),
295 ("_ccall_GC_", ITccall (False, False, True)),
296 ("_casm_", ITccall (False, True, False)),
297 ("_casm_GC_", ITccall (False, True, True)),
299 -- interface keywords
300 ("__interface", ITinterface),
301 ("__export", IT__export),
302 ("__depends", ITdepends),
303 ("__forall", IT__forall),
304 ("__letrec", ITletrec),
305 ("__coerce", ITcoerce),
306 ("__inline_me", ITinlineMe),
307 ("__inline_call", ITinlineCall),
308 ("__depends", ITdepends),
309 ("__DEFAULT", ITdefaultbranch),
311 ("__integer", ITinteger_lit),
312 ("__float", ITfloat_lit),
313 ("__rational", ITrational_lit),
314 ("__addr", ITaddr_lit),
315 ("__litlit", ITlit_lit),
316 ("__string", ITstring_lit),
319 ("__fuall", ITfuall),
321 ("__P", ITspecialise),
324 ("__D", ITdeprecated),
325 ("__U", ITunfold NoInlinePragInfo),
327 ("__ccall", ITccall (False, False, False)),
328 ("__ccall_GC", ITccall (False, False, True)),
329 ("__dyn_ccall", ITccall (True, False, False)),
330 ("__dyn_ccall_GC", ITccall (True, False, True)),
331 ("__casm", ITccall (False, True, False)),
332 ("__dyn_casm", ITccall (True, True, False)),
333 ("__casm_GC", ITccall (False, True, True)),
334 ("__dyn_casm_GC", ITccall (True, True, True)),
340 haskellKeySymsFM = listToUFM $
341 map (\ (x,y) -> (_PK_ x,y))
354 ,(".", ITdot) -- sadly, for 'forall a . t'
358 -----------------------------------------------------------------------------
363 - (glaexts) lexing an interface file or -fglasgow-exts
364 - (bol) pointer to beginning of line (for column calculations)
365 - (buf) pointer to beginning of token
366 - (buf) pointer to current char
367 - (atbol) flag indicating whether we're at the beginning of a line
370 lexer :: (Token -> P a) -> P a
371 lexer cont buf s@(PState{
373 glasgow_exts = glaexts,
379 -- first, start a new lexeme and lose all the whitespace
381 tab line bol atbol (stepOverLexeme buf)
383 line = srcLocLine loc
385 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
386 case currentChar# buf of
389 if bufferExhausted (stepOn buf)
390 then cont ITeof buf s'
391 else trace "lexer: misplaced NUL?" $
392 tab y bol atbol (stepOn buf)
394 '\n'# -> let buf' = stepOn buf
395 in tab (y +# 1#) (currentIndex# buf') 1# buf'
397 -- find comments. This got harder in Haskell 98.
398 '-'# -> let trundle n =
399 let next = lookAhead# buf n in
400 if next `eqChar#` '-'# then trundle (n +# 1#)
401 else if is_symbol next || n <# 2#
403 else case untilChar# (stepOnBy# buf n) '\n'# of
404 { buf' -> tab y bol atbol (stepOverLexeme buf')
408 -- comments and pragmas. We deal with LINE pragmas here,
409 -- and throw out any unrecognised pragmas as comments. Any
410 -- pragmas we know about are dealt with later (after any layout
411 -- processing if necessary).
413 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
414 if lookAhead# buf 2# `eqChar#` '#'# then
415 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
416 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
417 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
418 let lexeme = mkFastString -- ToDo: too slow
419 (map toUpper (lexemeToString buf2)) in
420 case lookupUFM pragmaKeywordsFM lexeme of
421 Just ITline_prag -> line_prag (lexer cont) buf2 s'
422 Just other -> is_a_token
423 Nothing -> skip_to_end (stepOnBy# buf 2#)
426 else skip_to_end (stepOnBy# buf 2#)
428 skip_to_end buf = nested_comment (lexer cont) buf s'
430 -- tabs have been expanded beforehand
431 c | is_space c -> tab y bol atbol (stepOn buf)
432 | otherwise -> is_a_token
434 where s' = s{loc = replaceSrcLine loc y,
438 is_a_token | atbol /=# 0# = lexBOL cont buf s'
439 | otherwise = lexToken cont glaexts buf s'
441 -- {-# LINE .. #-} pragmas. yeuch.
443 case expandWhile# is_space buf of { buf1 ->
444 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
445 -- subtract one: the line number refers to the *following* line.
446 let real_line = line - 1 in
447 case fromInteger real_line of { i@(I# l) ->
448 case expandWhile# is_space buf2 of { buf3 ->
449 case currentChar# buf3 of
451 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
452 let file = lexemeToFastString buf4 in
453 \s@PState{loc=loc} -> skipToEnd buf4 s{loc = mkSrcLoc file i}
455 other -> \s@PState{loc=loc} -> skipToEnd buf3 s{loc = replaceSrcLine loc l}
458 skipToEnd buf = nested_comment cont buf
460 nested_comment :: P a -> P a
461 nested_comment cont buf = loop buf
464 case currentChar# buf of
465 '\NUL'# | bufferExhausted (stepOn buf) ->
466 lexError "unterminated `{-'" buf
468 '-'# | lookAhead# buf 1# `eqChar#` '}'# ->
469 cont (stepOnBy# buf 2#)
471 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
472 nested_comment (nested_comment cont) (stepOnBy# buf 2#)
474 '\n'# -> \ s@PState{loc=loc} ->
475 let buf' = stepOn buf in
476 nested_comment cont buf'
477 s{loc = incSrcLine loc, bol = currentIndex# buf',
480 _ -> nested_comment cont (stepOn buf)
482 -- When we are lexing the first token of a line, check whether we need to
483 -- insert virtual semicolons or close braces due to layout.
485 lexBOL :: (Token -> P a) -> P a
486 lexBOL cont buf s@(PState{
488 glasgow_exts = glaexts,
493 if need_close_curly then
494 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
495 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
496 else if need_semi_colon then
497 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
498 cont ITsemi buf s{atbol = 0#}
500 lexToken cont glaexts buf s{atbol = 0#}
502 col = currentIndex# buf -# bol
515 Layout n -> col ==# n
518 lexToken :: (Token -> P a) -> Int# -> P a
519 lexToken cont glaexts buf =
521 case currentChar# buf of
523 -- special symbols ----------------------------------------------------
524 '('# | flag glaexts && lookAhead# buf 1# `eqChar#` '#'#
525 -> cont IToubxparen (setCurrentPos# buf 2#)
527 -> cont IToparen (incLexeme buf)
529 ')'# -> cont ITcparen (incLexeme buf)
530 '['# -> cont ITobrack (incLexeme buf)
531 ']'# -> cont ITcbrack (incLexeme buf)
532 ','# -> cont ITcomma (incLexeme buf)
533 ';'# -> cont ITsemi (incLexeme buf)
535 '}'# -> \ s@PState{context = ctx} ->
537 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
538 _ -> lexError "too many '}'s" buf s
540 '#'# -> case lookAhead# buf 1# of
541 ')'# | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
542 '-'# -> case lookAhead# buf 2# of
543 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
544 _ -> lex_sym cont (incLexeme buf)
545 _ -> lex_sym cont (incLexeme buf)
547 '`'# | flag glaexts && lookAhead# buf 1# `eqChar#` '`'#
548 -> lex_cstring cont (setCurrentPos# buf 2#)
550 -> cont ITbackquote (incLexeme buf)
552 '{'# -> -- look for "{-##" special iface pragma
553 case lookAhead# buf 1# of
554 '-'# -> case lookAhead# buf 2# of
555 '#'# -> case lookAhead# buf 3# of
558 = doDiscard False (stepOnBy# (stepOverLexeme buf) 4#) in
559 cont (ITpragma lexeme) buf'
560 _ -> lex_prag cont (setCurrentPos# buf 3#)
561 _ -> cont ITocurly (incLexeme buf)
562 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
564 -- strings/characters -------------------------------------------------
565 '\"'#{-"-} -> lex_string cont glaexts "" (incLexeme buf)
566 '\''# -> lex_char (char_end cont) glaexts (incLexeme buf)
568 -- strictness and cpr pragmas and __scc treated specially.
569 '_'# | flag glaexts ->
570 case lookAhead# buf 1# of
571 '_'# -> case lookAhead# buf 2# of
573 lex_demand cont (stepOnUntil (not . isSpace)
574 (stepOnBy# buf 3#)) -- past __S
576 lex_cpr cont (stepOnUntil (not . isSpace)
577 (stepOnBy# buf 3#)) -- past __M
579 case prefixMatch (stepOnBy# buf 3#) "cc" of
580 Just buf' -> lex_scc cont (stepOverLexeme buf')
581 Nothing -> lex_id cont glaexts buf
582 _ -> lex_id cont glaexts buf
583 _ -> lex_id cont glaexts buf
585 -- Hexadecimal and octal constants
586 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
587 -> readNum (after_lexnum cont glaexts) buf' is_hexdigit 16 hex
588 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
589 -> readNum (after_lexnum cont glaexts) buf' is_octdigit 8 oct_or_dec
590 where ch = lookAhead# buf 1#
591 ch2 = lookAhead# buf 2#
592 buf' = setCurrentPos# buf 2#
595 if bufferExhausted (stepOn buf) then
598 trace "lexIface: misplaced NUL?" $
599 cont (ITunknown "\NUL") (stepOn buf)
601 '?'# | flag glaexts && is_lower (lookAhead# buf 1#) ->
602 lex_ip cont (stepOn buf)
603 c | is_digit c -> lex_num cont glaexts 0 buf
604 | is_symbol c -> lex_sym cont buf
605 | is_upper c -> lex_con cont glaexts buf
606 | is_ident c -> lex_id cont glaexts buf
607 | otherwise -> lexError "illegal character" buf
609 -- Int# is unlifted, and therefore faster than Bool for flags.
615 -------------------------------------------------------------------------------
619 = case expandWhile# is_space buf of { buf1 ->
620 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
621 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
622 case lookupUFM pragmaKeywordsFM lexeme of
623 Just kw -> cont kw (mergeLexemes buf buf2)
624 Nothing -> panic "lex_prag"
627 -------------------------------------------------------------------------------
630 lex_string cont glaexts s buf
631 = case currentChar# buf of
633 let buf' = incLexeme buf; s' = mkFastString (reverse s) in
634 case currentChar# buf' of
635 '#'# | flag glaexts -> cont (ITprimstring s') (incLexeme buf')
636 _ -> cont (ITstring s') buf'
638 -- ignore \& in a string, deal with string gaps
639 '\\'# | next_ch `eqChar#` '&'#
640 -> lex_string cont glaexts s buf'
642 -> lex_stringgap cont glaexts s (incLexeme buf)
644 where next_ch = lookAhead# buf 1#
645 buf' = setCurrentPos# buf 2#
647 _ -> lex_char (lex_next_string cont s) glaexts buf
649 lex_stringgap cont glaexts s buf
650 = let buf' = incLexeme buf in
651 case currentChar# buf of
652 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont glaexts s buf'
653 st{loc = incSrcLine loc}
654 '\\'# -> lex_string cont glaexts s buf'
655 c | is_space c -> lex_stringgap cont glaexts s buf'
656 other -> charError buf'
658 lex_next_string cont s glaexts c buf = lex_string cont glaexts (c:s) buf
660 lex_char :: (Int# -> Char -> P a) -> Int# -> P a
661 lex_char cont glaexts buf
662 = case currentChar# buf of
663 '\\'# -> lex_escape (cont glaexts) (incLexeme buf)
664 c | is_any c -> cont glaexts (C# c) (incLexeme buf)
665 other -> charError buf
667 char_end cont glaexts c buf
668 = case currentChar# buf of
669 '\''# -> let buf' = incLexeme buf in
670 case currentChar# buf' of
672 -> cont (ITprimchar c) (incLexeme buf')
673 _ -> cont (ITchar c) buf'
677 = let buf' = incLexeme buf in
678 case currentChar# buf of
679 'a'# -> cont '\a' buf'
680 'b'# -> cont '\b' buf'
681 'f'# -> cont '\f' buf'
682 'n'# -> cont '\n' buf'
683 'r'# -> cont '\r' buf'
684 't'# -> cont '\t' buf'
685 'v'# -> cont '\v' buf'
686 '\\'# -> cont '\\' buf'
687 '"'# -> cont '\"' buf'
688 '\''# -> cont '\'' buf'
689 '^'# -> let c = currentChar# buf' in
690 if c `geChar#` '@'# && c `leChar#` '_'#
691 then cont (C# (chr# (ord# c -# ord# '@'#))) (incLexeme buf')
694 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
695 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
697 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
699 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
700 Just buf2 <- [prefixMatch buf p] ] of
701 (c,buf2):_ -> cont c buf2
704 after_charnum cont i buf
705 = let int = fromInteger i in
706 if i >= 0 && i <= 255
707 then cont (chr int) buf
710 readNum cont buf is_digit base conv = read buf 0
712 = case currentChar# buf of { c ->
714 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
720 || (c `geChar#` 'a'# && c `leChar#` 'f'#)
721 || (c `geChar#` 'A'# && c `leChar#` 'F'#)
723 hex c | is_digit c = ord# c -# ord# '0'#
724 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
725 oct_or_dec c = ord# c -# ord# '0'#
727 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
730 | c `geChar#` 'A'# && c `leChar#` 'Z'#
731 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
734 charError buf = lexError "error in character literal" buf
736 silly_escape_chars = [
773 -------------------------------------------------------------------------------
775 lex_demand cont buf =
776 case read_em [] buf of { (ls,buf') ->
777 case currentChar# buf' of
778 'B'# -> cont (ITstrict (ls, True )) (incLexeme buf')
779 _ -> cont (ITstrict (ls, False)) buf'
782 -- code snatched from Demand.lhs
784 case currentChar# buf of
785 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
786 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
787 'S'# -> read_em (WwStrict : acc) (stepOn buf)
788 'P'# -> read_em (WwPrim : acc) (stepOn buf)
789 'E'# -> read_em (WwEnum : acc) (stepOn buf)
790 ')'# -> (reverse acc, stepOn buf)
791 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
792 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
793 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
794 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
795 _ -> (reverse acc, buf)
797 do_unpack new_or_data wrapper_unpacks acc buf
798 = case read_em [] buf of
799 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
802 case read_em [] buf of { (cpr_inf,buf') ->
803 ASSERT ( null (tail cpr_inf) )
804 cont (ITcprinfo $ head cpr_inf) buf'
807 -- code snatched from lex_demand above
809 case currentChar# buf of
810 '-'# -> read_em (NoCPRInfo : acc) (stepOn buf)
811 '('# -> do_unpack acc (stepOn buf)
812 ')'# -> (reverse acc, stepOn buf)
813 _ -> (reverse acc, buf)
816 = case read_em [] buf of
817 (stuff, rest) -> read_em ((CPRInfo stuff) : acc) rest
821 case currentChar# buf of
822 'C'# -> cont ITsccAllCafs (incLexeme buf)
823 other -> cont ITscc buf
825 -----------------------------------------------------------------------------
828 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
829 lex_num cont glaexts acc buf =
830 case scanNumLit acc buf of
832 case currentChar# buf' of
833 '.'# | is_digit (lookAhead# buf' 1#) ->
834 -- this case is not optimised at all, as the
835 -- presence of floating point numbers in interface
836 -- files is not that common. (ToDo)
837 case expandWhile# is_digit (incLexeme buf') of
838 buf2 -> -- points to first non digit char
840 let l = case currentChar# buf2 of
846 = let buf3 = incLexeme buf2 in
847 case currentChar# buf3 of
848 '-'# -> expandWhile# is_digit (incLexeme buf3)
849 '+'# -> expandWhile# is_digit (incLexeme buf3)
850 x | is_digit x -> expandWhile# is_digit buf3
853 v = readRational__ (lexemeToString l)
855 in case currentChar# l of -- glasgow exts only
856 '#'# | flag glaexts -> let l' = incLexeme l in
857 case currentChar# l' of
858 '#'# -> cont (ITprimdouble v) (incLexeme l')
859 _ -> cont (ITprimfloat v) l'
860 _ -> cont (ITrational v) l
862 _ -> after_lexnum cont glaexts acc' buf'
864 after_lexnum cont glaexts i buf
865 = case currentChar# buf of
866 '#'# | flag glaexts -> cont (ITprimint i) (incLexeme buf)
867 _ -> cont (ITinteger i) buf
869 -----------------------------------------------------------------------------
870 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
872 -- we lexemeToFastString on the bit between the ``''s, but include the
873 -- quotes in the full lexeme.
875 lex_cstring cont buf =
876 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
877 Just buf' -> cont (ITlitlit (lexemeToFastString
878 (setCurrentPos# buf' (negateInt# 2#))))
879 (mergeLexemes buf buf')
880 Nothing -> lexError "unterminated ``" buf
882 ------------------------------------------------------------------------------
885 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
887 {-# INLINE is_ctype #-}
888 #if __GLASGOW_HASKELL__ >= 303
889 is_ctype :: Word8 -> Char# -> Bool
890 is_ctype mask = \c ->
891 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
893 is_ctype :: Int -> Char# -> Bool
894 is_ctype (I# mask) = \c ->
895 let (A# ctype) = ``char_types'' :: Addr
896 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
898 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
901 is_ident = is_ctype 1
902 is_symbol = is_ctype 2
904 is_space = is_ctype 8
905 is_lower = is_ctype 16
906 is_upper = is_ctype 32
907 is_digit = is_ctype 64
909 -----------------------------------------------------------------------------
910 -- identifiers, symbols etc.
913 case expandWhile# is_ident buf of
914 buf' -> cont (ITipvarid lexeme) buf'
915 where lexeme = lexemeToFastString buf'
917 lex_id cont glaexts buf =
918 case expandWhile# is_ident buf of { buf1 ->
920 case (if flag glaexts
921 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
922 else buf1) of { buf' ->
924 let lexeme = lexemeToFastString buf' in
926 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
927 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
931 let var_token = cont (mk_var_token lexeme) buf' in
933 if not (flag glaexts)
937 case lookupUFM ghcExtensionKeywordsFM lexeme of {
938 Just kwd_token -> cont kwd_token buf';
944 case expandWhile# is_symbol buf of
945 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
946 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
947 cont kwd_token buf' ;
948 Nothing -> --trace ("sym: "++unpackFS lexeme) $
949 cont (mk_var_token lexeme) buf'
951 where lexeme = lexemeToFastString buf'
954 lex_con cont glaexts buf =
955 case expandWhile# is_ident buf of { buf1 ->
956 case slurp_trailing_hashes buf1 glaexts of { buf' ->
958 case currentChar# buf' of
963 just_a_conid = --trace ("con: "++unpackFS lexeme) $
964 cont (ITconid lexeme) buf'
965 lexeme = lexemeToFastString buf'
966 munch = lex_qid cont glaexts lexeme (incLexeme buf') just_a_conid
969 lex_qid cont glaexts mod buf just_a_conid =
970 case currentChar# buf of
971 '['# -> -- Special case for []
972 case lookAhead# buf 1# of
973 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
976 '('# -> -- Special case for (,,,)
977 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
978 case lookAhead# buf 1# of
979 '#'# | flag glaexts -> case lookAhead# buf 2# of
980 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
983 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
984 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
987 '-'# -> case lookAhead# buf 1# of
988 '>'# -> cont (ITqconid (mod,SLIT("->"))) (setCurrentPos# buf 2#)
989 _ -> lex_id3 cont glaexts mod buf just_a_conid
990 _ -> lex_id3 cont glaexts mod buf just_a_conid
992 lex_id3 cont glaexts mod buf just_a_conid
993 | is_symbol (currentChar# buf) =
995 start_new_lexeme = stepOverLexeme buf
997 case expandWhile# is_symbol start_new_lexeme of { buf' ->
999 lexeme = lexemeToFastString buf'
1000 -- real lexeme is M.<sym>
1001 new_buf = mergeLexemes buf buf'
1003 cont (mk_qvar_token mod lexeme) new_buf
1004 -- wrong, but arguably morally right: M... is now a qvarsym
1009 start_new_lexeme = stepOverLexeme buf
1011 case expandWhile# is_ident start_new_lexeme of { buf1 ->
1016 case slurp_trailing_hashes buf1 glaexts of { buf' ->
1019 lexeme = lexemeToFastString buf'
1020 new_buf = mergeLexemes buf buf'
1021 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1023 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1024 Just kwd_token -> just_a_conid; -- avoid M.where etc.
1025 Nothing -> is_a_qvarid
1026 -- TODO: special ids (as, qualified, hiding) shouldn't be
1027 -- recognised as keywords here. ie. M.as is a qualified varid.
1031 slurp_trailing_hashes buf glaexts
1032 | flag glaexts = expandWhile# (`eqChar#` '#'#) buf
1037 | is_upper f = ITconid pk_str
1038 -- _[A-Z] is treated as a constructor in interface files.
1039 | f `eqChar#` '_'# && not (_NULL_ tl)
1040 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
1041 | is_ident f = ITvarid pk_str
1042 | f `eqChar#` ':'# = ITconsym pk_str
1043 | otherwise = ITvarsym pk_str
1045 (C# f) = _HEAD_ pk_str
1048 mk_qvar_token m token =
1049 case mk_var_token token of
1050 ITconid n -> ITqconid (m,n)
1051 ITvarid n -> ITqvarid (m,n)
1052 ITconsym n -> ITqconsym (m,n)
1053 ITvarsym n -> ITqvarsym (m,n)
1054 _ -> ITunknown (show token)
1057 ----------------------------------------------------------------------------
1058 Horrible stuff for dealing with M.(,,,)
1061 lex_tuple cont mod buf back_off =
1065 case currentChar# buf of
1066 ','# -> go (n+1) (stepOn buf)
1067 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n))) (stepOn buf)
1070 lex_ubx_tuple cont mod buf back_off =
1074 case currentChar# buf of
1075 ','# -> go (n+1) (stepOn buf)
1076 '#'# -> case lookAhead# buf 1# of
1077 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n)))
1083 -----------------------------------------------------------------------------
1084 doDiscard rips along really fast, looking for a '#-}',
1085 indicating the end of the pragma we're skipping
1088 doDiscard inStr buf =
1089 case currentChar# buf of
1091 case lookAhead# buf 1# of { '#'# ->
1092 case lookAhead# buf 2# of { '-'# ->
1093 case lookAhead# buf 3# of { '}'# ->
1094 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
1095 _ -> doDiscard inStr (incLexeme buf) };
1096 _ -> doDiscard inStr (incLexeme buf) };
1097 _ -> doDiscard inStr (incLexeme buf) }
1100 odd_slashes buf flg i# =
1101 case lookAhead# buf i# of
1102 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1105 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1106 '\\'# -> -- escaping something..
1107 if odd_slashes buf True (negateInt# 2#) then
1108 -- odd number of slashes, " is escaped.
1109 doDiscard inStr (incLexeme buf)
1111 -- even number of slashes, \ is escaped.
1112 doDiscard (not inStr) (incLexeme buf)
1113 _ -> case inStr of -- forced to avoid build-up
1114 True -> doDiscard False (incLexeme buf)
1115 False -> doDiscard True (incLexeme buf)
1116 _ -> doDiscard inStr (incLexeme buf)
1120 -----------------------------------------------------------------------------
1131 data PState = PState {
1133 glasgow_exts :: Int#,
1136 context :: [LayoutContext]
1139 type P a = StringBuffer -- Input string
1144 returnP a buf s = POk s a
1146 thenP :: P a -> (a -> P b) -> P b
1147 m `thenP` k = \ buf s ->
1149 POk s1 a -> k a buf s1
1150 PFailed err -> PFailed err
1152 thenP_ :: P a -> P b -> P b
1153 m `thenP_` k = m `thenP` \_ -> k
1155 mapP :: (a -> P b) -> [a] -> P [b]
1156 mapP f [] = returnP []
1159 mapP f as `thenP` \bs ->
1162 failP :: String -> P a
1163 failP msg buf s = PFailed (text msg)
1165 failMsgP :: Message -> P a
1166 failMsgP msg buf s = PFailed msg
1168 lexError :: String -> P a
1169 lexError str buf s@PState{ loc = loc }
1170 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1172 getSrcLocP :: P SrcLoc
1173 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1175 getSrcFile :: P FAST_STRING
1176 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1178 getContext :: P [LayoutContext]
1179 getContext buf s@(PState{ context = ctx }) = POk s ctx
1181 pushContext :: LayoutContext -> P ()
1182 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1186 This special case in layoutOn is to handle layout contexts with are
1187 indented the same or less than the current context. This is illegal
1188 according to the Haskell spec, so we have to arrange to close the
1189 current context. eg.
1194 after the first 'where', the sequence of events is:
1196 - layout system inserts a ';' (column 0)
1197 - parser begins a new context at column 0
1198 - parser shifts ';' (legal empty declaration)
1199 - parser sees 'class': parse error (we're still in the inner context)
1201 trouble is, by the time we know we need a new context, the lexer has
1202 already generated the ';'. Hacky solution is as follows: since we
1203 know the column of the next token (it's the column number of the new
1204 context), we set the ACTUAL column number of the new context to this
1205 numer plus one. Hence the next time the lexer is called, a '}' will
1206 be generated to close the new context straight away. Furthermore, we
1207 have to set the atbol flag so that the ';' that the parser shifted as
1208 part of the new context is re-generated.
1210 when the new context is *less* indented than the current one:
1212 f = f where g = g where
1215 - current context: column 12.
1216 - on seeing 'h' (column 0), the layout system inserts '}'
1217 - parser starts a new context, column 0
1218 - parser sees '}', uses it to close new context
1219 - we still need to insert another '}' followed by a ';',
1220 hence the atbol trick.
1222 There's also a special hack in here to deal with
1229 i.e. the inner context is at the same indentation level as the outer
1230 context. This is strictly illegal according to Haskell 98, but
1231 there's a lot of existing code using this style and it doesn't make
1232 any sense to disallow it, since empty 'do' lists don't make sense.
1235 layoutOn :: Bool -> P ()
1236 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1237 let offset = lexemeIndex buf -# bol in
1240 | if strict then prev_off >=# offset else prev_off ># offset ->
1241 --trace ("layout on, column: " ++ show (I# offset)) $
1242 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1244 --trace ("layout on, column: " ++ show (I# offset)) $
1245 POk s{ context = Layout offset : ctx } ()
1248 layoutOff buf s@(PState{ context = ctx }) =
1249 POk s{ context = NoLayout:ctx } ()
1252 popContext = \ buf s@(PState{ context = ctx }) ->
1254 (_:tl) -> POk s{ context = tl } ()
1255 [] -> panic "Lex.popContext: empty context"
1258 Note that if the name of the file we're processing ends
1259 with `hi-boot', we accept it on faith as having the right
1260 version. This is done so that .hi-boot files that comes
1261 with hsc don't have to be updated before every release,
1262 *and* it allows us to share .hi-boot files with versions
1263 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1265 If the version number is 0, the checking is also turned off.
1266 (needed to deal with GHC.hi only!)
1268 Once we can assume we're compiling with a version of ghc that
1269 supports interface file checking, we can drop the special
1272 checkVersion :: Maybe Integer -> P ()
1273 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1274 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1275 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1276 checkVersion mb@Nothing buf s@(PState{loc = loc})
1277 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1278 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1280 -----------------------------------------------------------------
1282 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1284 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1285 text (lexemeToString s), char '\'']
1287 ifaceVersionErr hi_vers l toks
1288 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1289 ptext SLIT("Expected"), int opt_HiVersion,
1290 ptext SLIT("found "), pp_version]
1294 Nothing -> ptext SLIT("pre ghc-3.02 version")
1295 Just v -> ptext SLIT("version") <+> integer v