2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
4 \section[Lexical analysis]{Lexical analysis}
6 --------------------------------------------------------
8 There's a known bug in here:
10 If an interface file ends prematurely, Lex tries to
11 do headFS of an empty FastString.
13 An example that provokes the error is
15 f _:_ _forall_ [a] <<<END OF FILE>>>
16 --------------------------------------------------------
19 {-# OPTIONS -#include "ctypes.h" #-}
26 Token(..), lexer, ParseResult(..), PState(..),
30 P, thenP, thenP_, returnP, mapP, failP, failMsgP,
31 getSrcLocP, getSrcFile,
32 layoutOn, layoutOff, pushContext, popContext
35 #include "HsVersions.h"
37 import Char ( ord, isSpace, toUpper )
38 import List ( isSuffixOf )
40 import IdInfo ( InlinePragInfo(..), CprInfo(..) )
41 import Name ( isLowerISO, isUpperISO )
42 import PrelMods ( mkTupNameStr, mkUbxTupNameStr )
43 import CmdLineOpts ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
44 import Demand ( Demand(..) {- instance Read -} )
45 import UniqFM ( UniqFM, listToUFM, lookupUFM)
46 import BasicTypes ( NewOrData(..) )
47 import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
48 replaceSrcLine, mkSrcLoc )
50 import Maybes ( MaybeErr(..) )
51 import ErrUtils ( Message )
59 #if __GLASGOW_HASKELL__ >= 303
66 import PrelRead ( readRational__ ) -- Glasgow non-std
69 %************************************************************************
71 \subsection{Data types}
73 %************************************************************************
75 The token data type, fairly un-interesting except from one
76 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
77 strictness, unfolding etc).
79 The Idea/Observation here is that the renamer needs to scan through
80 all of an interface file before it can continue. But only a fraction
81 of the information contained in the file turns out to be useful, so
82 delaying as much as possible of the scanning and parsing of an
83 interface file Makes Sense (Heap profiles of the compiler
84 show a reduction in heap usage by at least a factor of two,
87 Hence, the interface file lexer spots when value declarations are
88 being scanned and return the @ITidinfo@ and @ITtype@ constructors
89 for the type and any other id info for that binding (unfolding, strictness
90 etc). These constructors are applied to the result of lexing these sub-chunks.
92 The lexing of the type and id info is all done lazily, of course, so
93 the scanning (and subsequent parsing) will be done *only* on the ids the
94 renamer finds out that it is interested in. The rest will just be junked.
95 Laziness, you know it makes sense :-)
99 = ITcase -- Haskell keywords
122 | ITforall -- GHC extension keywords
129 | ITinterface -- interface keywords
137 | ITccall (Bool,Bool,Bool) -- (is_dyn, is_casm, may_gc)
152 | ITunfold InlinePragInfo
153 | ITstrict ([Demand], Bool)
155 | ITcprinfo (CprInfo)
159 | ITspecialise_prag -- Pragmas
167 | ITdotdot -- reserved symbols
181 | ITbiglam -- GHC-extension symbols
183 | ITocurly -- special symbols
197 | ITvarid FAST_STRING -- identifiers
198 | ITconid FAST_STRING
199 | ITvarsym FAST_STRING
200 | ITconsym FAST_STRING
201 | ITqvarid (FAST_STRING,FAST_STRING)
202 | ITqconid (FAST_STRING,FAST_STRING)
203 | ITqvarsym (FAST_STRING,FAST_STRING)
204 | ITqconsym (FAST_STRING,FAST_STRING)
206 | ITpragma StringBuffer
209 | ITstring FAST_STRING
211 | ITrational Rational
214 | ITprimstring FAST_STRING
216 | ITprimfloat Rational
217 | ITprimdouble Rational
218 | ITlitlit FAST_STRING
220 | ITunknown String -- Used when the lexer can't make sense of it
221 | ITeof -- end of file token
222 deriving Text -- debugging
225 -----------------------------------------------------------------------------
229 pragmaKeywordsFM = listToUFM $
230 map (\ (x,y) -> (_PK_ x,y))
231 [( "SPECIALISE", ITspecialise_prag ),
232 ( "SPECIALIZE", ITspecialise_prag ),
233 ( "SOURCE", ITsource_prag ),
234 ( "INLINE", ITinline_prag ),
235 ( "NOINLINE", ITnoinline_prag ),
236 ( "NOTINLINE", ITnoinline_prag ),
237 ( "LINE", ITline_prag ),
238 ( "RULES", ITrules_prag ),
239 ( "RULEZ", ITrules_prag ) -- american spelling :-)
242 haskellKeywordsFM = listToUFM $
243 map (\ (x,y) -> (_PK_ x,y))
244 [( "_", ITunderscore ),
246 ( "class", ITclass ),
248 ( "default", ITdefault ),
249 ( "deriving", ITderiving ),
253 ( "import", ITimport ),
255 ( "infix", ITinfix ),
256 ( "infixl", ITinfixl ),
257 ( "infixr", ITinfixr ),
258 ( "instance", ITinstance ),
260 ( "module", ITmodule ),
261 ( "newtype", ITnewtype ),
265 ( "where", ITwhere ),
269 ghcExtensionKeywordsFM = listToUFM $
270 map (\ (x,y) -> (_PK_ x,y))
271 [ ( "forall", ITforall ),
272 ( "foreign", ITforeign ),
273 ( "export", ITexport ),
274 ( "label", ITlabel ),
275 ( "dynamic", ITdynamic ),
276 ( "unsafe", ITunsafe ),
277 ("_ccall_", ITccall (False, False, False)),
278 ("_ccall_GC_", ITccall (False, False, True)),
279 ("_casm_", ITccall (False, True, False)),
280 ("_casm_GC_", ITccall (False, True, True)),
282 -- interface keywords
283 ("__interface", ITinterface),
284 ("__export", IT__export),
285 ("__depends", ITdepends),
286 ("__forall", IT__forall),
287 ("__letrec", ITletrec),
288 ("__coerce", ITcoerce),
289 ("__inline_me", ITinlineMe),
290 ("__inline_call", ITinlineCall),
291 ("__depends", ITdepends),
292 ("__DEFAULT", ITdefaultbranch),
294 ("__integer", ITinteger_lit),
295 ("__float", ITfloat_lit),
296 ("__rational", ITrational_lit),
297 ("__addr", ITaddr_lit),
298 ("__litlit", ITlit_lit),
299 ("__string", ITstring_lit),
304 ("__P", ITspecialise),
307 ("__u", ITunfold NoInlinePragInfo),
309 ("__ccall", ITccall (False, False, False)),
310 ("__ccall_GC", ITccall (False, False, True)),
311 ("__dyn_ccall", ITccall (True, False, False)),
312 ("__dyn_ccall_GC", ITccall (True, False, True)),
313 ("__casm", ITccall (False, True, False)),
314 ("__dyn_casm", ITccall (True, True, False)),
315 ("__casm_GC", ITccall (False, True, True)),
316 ("__dyn_casm_GC", ITccall (True, True, True)),
322 haskellKeySymsFM = listToUFM $
323 map (\ (x,y) -> (_PK_ x,y))
336 ,(".", ITdot) -- sadly, for 'forall a . t'
339 not_special_op ITminus = False
340 not_special_op ITbang = False
341 not_special_op _ = True
344 -----------------------------------------------------------------------------
349 - (glaexts) lexing an interface file or -fglasgow-exts
350 - (bol) pointer to beginning of line (for column calculations)
351 - (buf) pointer to beginning of token
352 - (buf) pointer to current char
353 - (atbol) flag indicating whether we're at the beginning of a line
356 lexer :: (Token -> P a) -> P a
357 lexer cont buf s@(PState{
359 glasgow_exts = glaexts,
365 -- first, start a new lexeme and lose all the whitespace
366 = tab line bol atbol (stepOverLexeme buf)
368 line = srcLocLine loc
370 tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
371 case currentChar# buf of
374 if bufferExhausted (stepOn buf)
375 then cont ITeof buf s'
376 else trace "lexer: misplaced NUL?" $
377 tab y bol atbol (stepOn buf)
379 '\n'# -> let buf' = stepOn buf
380 in tab (y +# 1#) (currentIndex# buf') 1# buf'
382 -- find comments. This got harder in Haskell 98.
383 '-'# -> let trundle n =
384 let next = lookAhead# buf n in
385 if next `eqChar#` '-'# then trundle (n +# 1#)
386 else if is_symbol next || n <# 2#
388 else case untilChar# (stepOnBy# buf n) '\n'# of
389 { buf' -> tab y bol atbol (stepOverLexeme buf')
393 -- comments and pragmas. We deal with LINE pragmas here,
394 -- and throw out any unrecognised pragmas as comments. Any
395 -- pragmas we know about are dealt with later (after any layout
396 -- processing if necessary).
398 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
399 if lookAhead# buf 2# `eqChar#` '#'# then
400 if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
401 case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
402 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2->
403 let lexeme = mkFastString -- ToDo: too slow
404 (map toUpper (lexemeToString buf2)) in
405 case lookupUFM pragmaKeywordsFM lexeme of
406 Just ITline_prag -> line_prag (lexer cont) buf2 s'
407 Just other -> is_a_token
408 Nothing -> skip_to_end (stepOnBy# buf 2#)
411 else skip_to_end (stepOnBy# buf 2#)
413 skip_to_end buf = nested_comment (lexer cont) buf s'
415 -- tabs have been expanded beforehand
416 c | is_space c -> tab y bol atbol (stepOn buf)
417 | otherwise -> is_a_token
419 where s' = s{loc = replaceSrcLine loc y,
423 is_a_token | atbol /=# 0# = lexBOL cont buf s'
424 | otherwise = lexToken cont glaexts buf s'
426 -- {-# LINE .. #-} pragmas. yeuch.
428 case expandWhile# is_space buf of { buf1 ->
429 case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) ->
430 -- subtract one: the line number refers to the *following* line.
431 let real_line = line - 1 in
432 case fromInteger real_line of { i@(I# l) ->
433 case expandWhile# is_space buf2 of { buf3 ->
434 case currentChar# buf3 of
436 case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
437 let file = lexemeToFastString buf4 in
438 \s@PState{loc=loc} -> skipToEnd buf4 s{loc = mkSrcLoc file i}
440 other -> \s@PState{loc=loc} -> skipToEnd buf3 s{loc = replaceSrcLine loc l}
443 skipToEnd buf = nested_comment cont buf
445 nested_comment :: P a -> P a
446 nested_comment cont buf = loop buf
449 case currentChar# buf of
450 '\NUL'# | bufferExhausted (stepOn buf) ->
451 lexError "unterminated `{-'" buf
453 '-'# | lookAhead# buf 1# `eqChar#` '}'# ->
454 cont (stepOnBy# buf 2#)
456 '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
457 nested_comment (nested_comment cont) (stepOnBy# buf 2#)
459 '\n'# -> \ s@PState{loc=loc} ->
460 let buf' = stepOn buf in
461 nested_comment cont buf'
462 s{loc = incSrcLine loc, bol = currentIndex# buf',
465 _ -> nested_comment cont (stepOn buf)
467 -- When we are lexing the first token of a line, check whether we need to
468 -- insert virtual semicolons or close braces due to layout.
470 lexBOL :: (Token -> P a) -> P a
471 lexBOL cont buf s@(PState{
473 glasgow_exts = glaexts,
478 if need_close_curly then
479 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
480 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
481 else if need_semi_colon then
482 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
483 cont ITsemi buf s{atbol = 0#}
485 lexToken cont glaexts buf s{atbol = 0#}
487 col = currentIndex# buf -# bol
500 Layout n -> col ==# n
503 lexToken :: (Token -> P a) -> Int# -> P a
504 lexToken cont glaexts buf =
507 case currentChar# buf of
509 -- special symbols ----------------------------------------------------
510 '('# | flag glaexts && lookAhead# buf 1# `eqChar#` '#'#
511 -> cont IToubxparen (setCurrentPos# buf 2#)
513 -> cont IToparen (incLexeme buf)
515 ')'# -> cont ITcparen (incLexeme buf)
516 '['# -> cont ITobrack (incLexeme buf)
517 ']'# -> cont ITcbrack (incLexeme buf)
518 ','# -> cont ITcomma (incLexeme buf)
519 ';'# -> cont ITsemi (incLexeme buf)
521 '}'# -> \ s@PState{context = ctx} ->
523 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
524 _ -> lexError "too many '}'s" buf s
526 '#'# -> case lookAhead# buf 1# of
527 ')'# | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
528 '-'# -> case lookAhead# buf 2# of
529 '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
530 _ -> lex_sym cont (incLexeme buf)
531 _ -> lex_sym cont (incLexeme buf)
533 '`'# | flag glaexts && lookAhead# buf 1# `eqChar#` '`'#
534 -> lex_cstring cont (setCurrentPos# buf 2#)
536 -> cont ITbackquote (incLexeme buf)
538 '{'# -> -- look for "{-##" special iface pragma
539 case lookAhead# buf 1# of
540 '-'# -> case lookAhead# buf 2# of
541 '#'# -> case lookAhead# buf 3# of
544 = doDiscard False (stepOnBy# (stepOverLexeme buf) 4#) in
545 cont (ITpragma lexeme) buf'
546 _ -> lex_prag cont (setCurrentPos# buf 3#)
547 _ -> cont ITocurly (incLexeme buf)
548 _ -> (layoutOff `thenP_` cont ITocurly) (incLexeme buf)
550 -- strings/characters -------------------------------------------------
551 '\"'#{-"-} -> lex_string cont glaexts "" (incLexeme buf)
552 '\''# -> lex_char (char_end cont) glaexts (incLexeme buf)
554 -- strictness and cpr pragmas and __scc treated specially.
555 '_'# | flag glaexts ->
556 case lookAhead# buf 1# of
557 '_'# -> case lookAhead# buf 2# of
559 lex_demand cont (stepOnUntil (not . isSpace)
560 (stepOnBy# buf 3#)) -- past __S
562 lex_cpr cont (stepOnUntil (not . isSpace)
563 (stepOnBy# buf 3#)) -- past __M
565 case prefixMatch (stepOnBy# buf 3#) "cc" of
566 Just buf' -> lex_scc cont (stepOverLexeme buf')
567 Nothing -> lex_id cont glaexts buf
568 _ -> lex_id cont glaexts buf
569 _ -> lex_id cont glaexts buf
571 -- Hexadecimal and octal constants
572 '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
573 -> readNum (after_lexnum cont glaexts) buf' is_hexdigit 16 hex
574 | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
575 -> readNum (after_lexnum cont glaexts) buf' is_octdigit 8 oct_or_dec
576 where ch = lookAhead# buf 1#
577 ch2 = lookAhead# buf 2#
578 buf' = setCurrentPos# buf 2#
581 if bufferExhausted (stepOn buf) then
584 trace "lexIface: misplaced NUL?" $
585 cont (ITunknown "\NUL") (stepOn buf)
587 c | is_digit c -> lex_num cont glaexts 0 buf
588 | is_symbol c -> lex_sym cont buf
589 | is_upper c -> lex_con cont glaexts buf
590 | is_ident c -> lex_id cont glaexts buf
591 | otherwise -> lexError "illegal character" buf
593 -- Int# is unlifted, and therefore faster than Bool for flags.
599 -------------------------------------------------------------------------------
603 = case expandWhile# is_space buf of { buf1 ->
604 case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
605 let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
606 case lookupUFM pragmaKeywordsFM lexeme of
607 Just kw -> cont kw (mergeLexemes buf buf2)
608 Nothing -> panic "lex_prag"
611 -------------------------------------------------------------------------------
614 lex_string cont glaexts s buf
615 = case currentChar# buf of
617 let buf' = incLexeme buf; s' = mkFastString (reverse s) in
618 case currentChar# buf' of
619 '#'# | flag glaexts -> cont (ITprimstring s') (incLexeme buf')
620 _ -> cont (ITstring s') buf'
622 -- ignore \& in a string, deal with string gaps
623 '\\'# | next_ch `eqChar#` '&'#
624 -> lex_string cont glaexts s (setCurrentPos# buf 2#)
626 -> lex_stringgap cont glaexts s buf'
628 where next_ch = lookAhead# buf 1#
629 buf' = setCurrentPos# buf 2#
631 _ -> lex_char (lex_next_string cont s) glaexts buf
633 lex_stringgap cont glaexts s buf
634 = let buf' = incLexeme buf in
635 case currentChar# buf of
636 '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont glaexts s buf'
637 st{loc = incSrcLine loc}
638 '\\'# -> lex_string cont glaexts s buf'
639 c | is_space c -> lex_stringgap cont glaexts s buf'
640 other -> charError buf'
642 lex_next_string cont s glaexts c buf = lex_string cont glaexts (c:s) buf
644 lex_char :: (Int# -> Char -> P a) -> Int# -> P a
645 lex_char cont glaexts buf
646 = case currentChar# buf of
647 '\\'# -> lex_escape (cont glaexts) (incLexeme buf)
648 c | is_any c -> cont glaexts (C# c) (incLexeme buf)
649 other -> charError buf
651 char_end cont glaexts c buf
652 = case currentChar# buf of
653 '\''# -> let buf' = incLexeme buf in
654 case currentChar# buf' of
656 -> cont (ITprimchar c) (incLexeme buf')
657 _ -> cont (ITchar c) buf'
661 = let buf' = incLexeme buf in
662 case currentChar# buf of
663 'a'# -> cont '\a' buf'
664 'b'# -> cont '\b' buf'
665 'f'# -> cont '\f' buf'
666 'n'# -> cont '\n' buf'
667 'r'# -> cont '\r' buf'
668 't'# -> cont '\t' buf'
669 'v'# -> cont '\v' buf'
670 '\\'# -> cont '\\' buf'
671 '"'# -> cont '\"' buf'
672 '\''# -> cont '\'' buf'
673 '^'# -> let c = currentChar# buf' in
674 if c `geChar#` '@'# && c `leChar#` '_'#
675 then cont (C# (chr# (ord# c -# ord# '@'#))) (incLexeme buf')
678 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
679 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec
681 -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec
683 _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
684 Just buf2 <- [prefixMatch buf p] ] of
685 (c,buf2):_ -> cont c buf2
688 after_charnum cont i buf
689 = let int = fromInteger i in
690 if i >= 0 && i <= 255
691 then cont (chr int) buf
694 readNum cont buf is_digit base conv = read buf 0
696 = case currentChar# buf of { c ->
698 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
704 || (c `geChar#` 'a'# && c `leChar#` 'h'#)
705 || (c `geChar#` 'A'# && c `leChar#` 'H'#)
707 hex c | is_digit c = ord# c -# ord# '0'#
708 | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10#
709 oct_or_dec c = ord# c -# ord# '0'#
711 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
714 | c `geChar#` 'A'# && c `leChar#` 'Z'#
715 = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
718 charError buf = lexError "error in character literal" buf
720 silly_escape_chars = [
757 -------------------------------------------------------------------------------
759 lex_demand cont buf =
760 case read_em [] buf of { (ls,buf') ->
761 case currentChar# buf' of
762 'B'# -> cont (ITstrict (ls, True )) (incLexeme buf')
763 _ -> cont (ITstrict (ls, False)) buf'
766 -- code snatched from Demand.lhs
768 case currentChar# buf of
769 'L'# -> read_em (WwLazy False : acc) (stepOn buf)
770 'A'# -> read_em (WwLazy True : acc) (stepOn buf)
771 'S'# -> read_em (WwStrict : acc) (stepOn buf)
772 'P'# -> read_em (WwPrim : acc) (stepOn buf)
773 'E'# -> read_em (WwEnum : acc) (stepOn buf)
774 ')'# -> (reverse acc, stepOn buf)
775 'U'# -> do_unpack DataType True acc (stepOnBy# buf 2#)
776 'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
777 'N'# -> do_unpack NewType True acc (stepOnBy# buf 2#)
778 'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
779 _ -> (reverse acc, buf)
781 do_unpack new_or_data wrapper_unpacks acc buf
782 = case read_em [] buf of
783 (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
786 case read_em [] buf of { (cpr_inf,buf') ->
787 ASSERT ( null (tail cpr_inf) )
788 cont (ITcprinfo $ head cpr_inf) buf'
791 -- code snatched from lex_demand above
793 case currentChar# buf of
794 '-'# -> read_em (NoCPRInfo : acc) (stepOn buf)
795 '('# -> do_unpack acc (stepOn buf)
796 ')'# -> (reverse acc, stepOn buf)
797 _ -> (reverse acc, buf)
800 = case read_em [] buf of
801 (stuff, rest) -> read_em ((CPRInfo stuff) : acc) rest
805 case currentChar# buf of
806 'C'# -> cont ITsccAllCafs (incLexeme buf)
807 other -> cont ITscc buf
809 -----------------------------------------------------------------------------
812 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
813 lex_num cont glaexts acc buf =
814 case scanNumLit acc buf of
816 case currentChar# buf' of
817 '.'# | is_digit (lookAhead# buf' 1#) ->
818 -- this case is not optimised at all, as the
819 -- presence of floating point numbers in interface
820 -- files is not that common. (ToDo)
821 case expandWhile# is_digit (incLexeme buf') of
822 buf2 -> -- points to first non digit char
824 let l = case currentChar# buf2 of
830 = let buf3 = incLexeme buf2 in
831 case currentChar# buf3 of
832 '-'# -> expandWhile# is_digit (incLexeme buf3)
833 '+'# -> expandWhile# is_digit (incLexeme buf3)
834 x | is_digit x -> expandWhile# is_digit buf3
837 v = readRational__ (lexemeToString l)
839 in case currentChar# l of -- glasgow exts only
840 '#'# | flag glaexts -> let l' = incLexeme l in
841 case currentChar# l' of
842 '#'# -> cont (ITprimdouble v) (incLexeme l')
843 _ -> cont (ITprimfloat v) l'
844 _ -> cont (ITrational v) l
846 _ -> after_lexnum cont glaexts acc' buf'
848 after_lexnum cont glaexts i buf
849 = case currentChar# buf of
850 '#'# | flag glaexts -> cont (ITprimint i) (incLexeme buf)
851 _ -> cont (ITinteger i) buf
853 -----------------------------------------------------------------------------
854 -- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.)
856 -- we lexemeToFastString on the bit between the ``''s, but include the
857 -- quotes in the full lexeme.
859 lex_cstring cont buf =
860 case expandUntilMatch (stepOverLexeme buf) "\'\'" of
861 buf' -> cont (ITlitlit (lexemeToFastString
862 (setCurrentPos# buf' (negateInt# 2#))))
863 (mergeLexemes buf buf')
865 ------------------------------------------------------------------------------
868 is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
870 {-# INLINE is_ctype #-}
871 #if __GLASGOW_HASKELL__ >= 303
872 is_ctype :: Word8 -> Char# -> Bool
873 is_ctype mask = \c ->
874 (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
876 is_ctype :: Int -> Char# -> Bool
877 is_ctype (I# mask) = \c ->
878 let (A# ctype) = ``char_types'' :: Addr
879 flag_word = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
881 (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
884 is_ident = is_ctype 1
885 is_symbol = is_ctype 2
887 is_space = is_ctype 8
888 is_upper = is_ctype 16
889 is_digit = is_ctype 32
891 -----------------------------------------------------------------------------
892 -- identifiers, symbols etc.
894 lex_id cont glaexts buf =
895 case expandWhile# is_ident buf of { buf1 ->
897 case (if flag glaexts
898 then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
899 else buf1) of { buf' ->
901 let lexeme = lexemeToFastString buf' in
903 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
904 Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
908 let var_token = cont (mk_var_token lexeme) buf' in
910 if not (flag glaexts)
914 case lookupUFM ghcExtensionKeywordsFM lexeme of {
915 Just kwd_token -> cont kwd_token buf';
921 case expandWhile# is_symbol buf of
922 buf' -> case lookupUFM haskellKeySymsFM lexeme of {
923 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
924 cont kwd_token buf' ;
925 Nothing -> --trace ("sym: "++unpackFS lexeme) $
926 cont (mk_var_token lexeme) buf'
928 where lexeme = lexemeToFastString buf'
931 lex_con cont glaexts buf =
932 case expandWhile# is_ident buf of { buf1 ->
933 case slurp_trailing_hashes buf1 glaexts of { buf' ->
935 case currentChar# buf' of
940 just_a_conid = --trace ("con: "++unpackFS lexeme) $
941 cont (ITconid lexeme) buf'
942 lexeme = lexemeToFastString buf'
943 munch = lex_qid cont glaexts lexeme (incLexeme buf') just_a_conid
946 lex_qid cont glaexts mod buf just_a_conid =
947 case currentChar# buf of
948 '['# -> -- Special case for []
949 case lookAhead# buf 1# of
950 ']'# -> cont (ITqconid (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
953 '('# -> -- Special case for (,,,)
954 -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
955 case lookAhead# buf 1# of
956 '#'# | flag glaexts -> case lookAhead# buf 2# of
957 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
960 ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
961 ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
964 '-'# -> case lookAhead# buf 1# of
965 '>'# -> cont (ITqconid (mod,SLIT("->"))) (setCurrentPos# buf 2#)
966 _ -> lex_id3 cont glaexts mod buf just_a_conid
967 _ -> lex_id3 cont glaexts mod buf just_a_conid
969 lex_id3 cont glaexts mod buf just_a_conid
970 | is_symbol (currentChar# buf) =
972 start_new_lexeme = stepOverLexeme buf
974 case expandWhile# is_symbol start_new_lexeme of { buf' ->
976 lexeme = lexemeToFastString buf'
977 -- real lexeme is M.<sym>
978 new_buf = mergeLexemes buf buf'
980 case lookupUFM haskellKeySymsFM lexeme of {
981 Just kwd_token | not_special_op kwd_token
982 -> just_a_conid; -- avoid M.::, but not M.!
983 other -> cont (mk_qvar_token mod lexeme) new_buf
988 start_new_lexeme = stepOverLexeme buf
990 case expandWhile# is_ident start_new_lexeme of { buf1 ->
995 case slurp_trailing_hashes buf1 glaexts of { buf' ->
998 lexeme = lexemeToFastString buf'
999 new_buf = mergeLexemes buf buf'
1000 is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1002 case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1003 Just kwd_token -> just_a_conid; -- avoid M.where etc.
1004 Nothing -> is_a_qvarid
1008 slurp_trailing_hashes buf glaexts
1009 | flag glaexts = expandWhile# (`eqChar#` '#'#) buf
1014 | is_upper f = ITconid pk_str
1015 -- _[A-Z] is treated as a constructor in interface files.
1016 | f `eqChar#` '_'# && not (_NULL_ tl)
1017 && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
1018 | is_ident f = ITvarid pk_str
1019 | f `eqChar#` ':'# = ITconsym pk_str
1020 | otherwise = ITvarsym pk_str
1022 (C# f) = _HEAD_ pk_str
1025 mk_qvar_token m token =
1026 case mk_var_token token of
1027 ITconid n -> ITqconid (m,n)
1028 ITvarid n -> ITqvarid (m,n)
1029 ITconsym n -> ITqconsym (m,n)
1030 ITvarsym n -> ITqvarsym (m,n)
1031 _ -> ITunknown (show token)
1034 ----------------------------------------------------------------------------
1035 Horrible stuff for dealing with M.(,,,)
1038 lex_tuple cont mod buf back_off =
1042 case currentChar# buf of
1043 ','# -> go (n+1) (stepOn buf)
1044 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n))) (stepOn buf)
1047 lex_ubx_tuple cont mod buf back_off =
1051 case currentChar# buf of
1052 ','# -> go (n+1) (stepOn buf)
1053 '#'# -> case lookAhead# buf 1# of
1054 ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n)))
1060 -----------------------------------------------------------------------------
1061 doDiscard rips along really fast, looking for a '#-}',
1062 indicating the end of the pragma we're skipping
1065 doDiscard inStr buf =
1066 case currentChar# buf of
1068 case lookAhead# buf 1# of { '#'# ->
1069 case lookAhead# buf 2# of { '-'# ->
1070 case lookAhead# buf 3# of { '}'# ->
1071 (lexemeToBuffer buf, stepOverLexeme (setCurrentPos# buf 4#));
1072 _ -> doDiscard inStr (incLexeme buf) };
1073 _ -> doDiscard inStr (incLexeme buf) };
1074 _ -> doDiscard inStr (incLexeme buf) }
1077 odd_slashes buf flg i# =
1078 case lookAhead# buf i# of
1079 '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1082 case lookAhead# buf (negateInt# 1#) of --backwards, actually
1083 '\\'# -> -- escaping something..
1084 if odd_slashes buf True (negateInt# 2#) then
1085 -- odd number of slashes, " is escaped.
1086 doDiscard inStr (incLexeme buf)
1088 -- even number of slashes, \ is escaped.
1089 doDiscard (not inStr) (incLexeme buf)
1090 _ -> case inStr of -- forced to avoid build-up
1091 True -> doDiscard False (incLexeme buf)
1092 False -> doDiscard True (incLexeme buf)
1093 _ -> doDiscard inStr (incLexeme buf)
1097 -----------------------------------------------------------------------------
1108 data PState = PState {
1110 glasgow_exts :: Int#,
1113 context :: [LayoutContext]
1116 type P a = StringBuffer -- Input string
1121 returnP a buf s = POk s a
1123 thenP :: P a -> (a -> P b) -> P b
1124 m `thenP` k = \ buf s ->
1126 POk s1 a -> k a buf s1
1127 PFailed err -> PFailed err
1129 thenP_ :: P a -> P b -> P b
1130 m `thenP_` k = m `thenP` \_ -> k
1132 mapP :: (a -> P b) -> [a] -> P [b]
1133 mapP f [] = returnP []
1136 mapP f as `thenP` \bs ->
1139 failP :: String -> P a
1140 failP msg buf s = PFailed (text msg)
1142 failMsgP :: Message -> P a
1143 failMsgP msg buf s = PFailed msg
1145 lexError :: String -> P a
1146 lexError str buf s@PState{ loc = loc }
1147 = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1149 getSrcLocP :: P SrcLoc
1150 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1152 getSrcFile :: P FAST_STRING
1153 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1155 getContext :: P [LayoutContext]
1156 getContext buf s@(PState{ context = ctx }) = POk s ctx
1158 pushContext :: LayoutContext -> P ()
1159 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1163 This special case in layoutOn is to handle layout contexts with are
1164 indented the same or less than the current context. This is illegal
1165 according to the Haskell spec, so we have to arrange to close the
1166 current context. eg.
1171 after the first 'where', the sequence of events is:
1173 - layout system inserts a ';' (column 0)
1174 - parser begins a new context at column 0
1175 - parser shifts ';' (legal empty declaration)
1176 - parser sees 'class': parse error (we're still in the inner context)
1178 trouble is, by the time we know we need a new context, the lexer has
1179 already generated the ';'. Hacky solution is as follows: since we
1180 know the column of the next token (it's the column number of the new
1181 context), we set the ACTUAL column number of the new context to this
1182 numer plus one. Hence the next time the lexer is called, a '}' will
1183 be generated to close the new context straight away. Furthermore, we
1184 have to set the atbol flag so that the ';' that the parser shifted as
1185 part of the new context is re-generated.
1187 when the new context is *less* indented than the current one:
1189 f = f where g = g where
1192 - current context: column 12.
1193 - on seeing 'h' (column 0), the layout system inserts '}'
1194 - parser starts a new context, column 0
1195 - parser sees '}', uses it to close new context
1196 - we still need to insert another '}' followed by a ';',
1197 hence the atbol trick.
1202 layoutOn buf s@(PState{ bol = bol, context = ctx }) =
1203 let offset = lexemeIndex buf -# bol in
1205 Layout prev_off : _ | prev_off >=# offset ->
1206 --trace ("layout on, column: " ++ show (I# offset)) $
1207 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1209 --trace ("layout on, column: " ++ show (I# offset)) $
1210 POk s{ context = Layout offset : ctx } ()
1213 layoutOff buf s@(PState{ context = ctx }) =
1214 POk s{ context = NoLayout:ctx } ()
1217 popContext = \ buf s@(PState{ context = ctx }) ->
1219 (_:tl) -> POk s{ context = tl } ()
1220 [] -> panic "Lex.popContext: empty context"
1223 Note that if the name of the file we're processing ends
1224 with `hi-boot', we accept it on faith as having the right
1225 version. This is done so that .hi-boot files that comes
1226 with hsc don't have to be updated before every release,
1227 *and* it allows us to share .hi-boot files with versions
1228 of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1230 If the version number is 0, the checking is also turned off.
1231 (needed to deal with GHC.hi only!)
1233 Once we can assume we're compiling with a version of ghc that
1234 supports interface file checking, we can drop the special
1237 checkVersion :: Maybe Integer -> P ()
1238 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1239 | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1240 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1241 checkVersion mb@Nothing buf s@(PState{loc = loc})
1242 | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1243 | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1245 -----------------------------------------------------------------
1247 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1249 = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1250 text (lexemeToString s), char '\'']
1252 ifaceVersionErr hi_vers l toks
1253 = hsep [ppr l, ptext SLIT("Interface file version error;"),
1254 ptext SLIT("Expected"), int opt_HiVersion,
1255 ptext SLIT("found "), pp_version]
1259 Nothing -> ptext SLIT("pre ghc-3.02 version")
1260 Just v -> ptext SLIT("version") <+> integer v